Corrected the loops; the program no longer adds the same data repeatedly

This commit is contained in:
XCX 2023-08-01 19:11:24 +08:00
parent 01c1a7d978
commit 2fc3b85bab
8 changed files with 88 additions and 72 deletions
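
The core of the fix: each filtering list comprehension re-scanned the whole of data on every pass of the for Dict in data loop, so every matching record was appended once per iteration. A minimal sketch of the before/after pattern (illustrative only, not the project's actual code):

# Before: the comprehension iterates over all of data for every Dict in the loop,
# so each matching record ends up appended len(data) times.
data = [{"volume": "2008"}, {"volume": "2012"}]
buggy_oldest = []
for Dict in data:
    buggy_oldest += [d for d in data if int(d["volume"]) <= 2009]
print(len(buggy_oldest))  # 2 -- the single 2008 record is added twice

# After: only the current record is tested and appended at most once.
fixed_oldest = []
for Dict in data:
    if int(Dict["volume"]) <= 2009:
        fixed_oldest.append(Dict)
print(len(fixed_oldest))  # 1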

View File

@@ -32,23 +32,27 @@ def Transf():
             data = json.load(file)
             for Dict in data:
-                if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
+                if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
                     # Select data
-                    data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
+                    if (isinstance(Dict, dict) and int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009):
+                        data_oldest.append(Dict)
-                    data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
-                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
+                    if (isinstance(Dict, dict) and 2010 <= int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014):
+                        data_2010_2014.append(Dict)
-                    data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
-                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
+                    if (isinstance(Dict, dict) and 2015 <= int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020):
+                        data_2015_2020.append(Dict)
-                    data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
+                    if (isinstance(Dict, dict) and int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021):
+                        data_newest.append(Dict)
+    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
     # Transfer
-    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
     for index in range(0, 4):
         with open(output_files[index], 'w', encoding='utf-8') as file:
             json.dump(Data[index], file, indent=4)

View File

@@ -49,9 +49,10 @@ Article_list = []
 hrefs = []
 # Base web urls
+baseWeb = 'https://www.math.u-szeged.hu/ejqtde/'
 current_year = datetime.datetime.now().year
 years = range(2009, 2011)  # years = range(2010, current_year + 1)
-baseWeb = 'https://www.math.u-szeged.hu/ejqtde/'
 url_list = ['https://www.math.u-szeged.hu/ejqtde/periodica.html?periodica=1&paramtipus_ertek=publications&param_ertek='
             + f'{year}' for year in years][::-1]
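
For reference, a quick standalone expansion of the comprehension above with the temporary test range; the [::-1] slice simply reverses the list so the newest year is crawled first:

years = range(2009, 2011)
base = 'https://www.math.u-szeged.hu/ejqtde/periodica.html?periodica=1&paramtipus_ertek=publications&param_ertek='
url_list = [base + f'{year}' for year in years][::-1]
# ['...param_ertek=2010', '...param_ertek=2009']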

View File

@@ -34,23 +34,27 @@ def Transf():
             data = json.load(file)
             for Dict in data:
-                if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
+                if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
-                    # Select data
+                    # 筛选文章
-                    data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
+                    if (isinstance(Dict, dict) and int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009):
+                        data_oldest.append(Dict)
-                    data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
-                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
+                    if (isinstance(Dict, dict) and 2010 <= int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014):
+                        data_2010_2014.append(Dict)
-                    data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
-                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
+                    if (isinstance(Dict, dict) and 2015 <= int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020):
+                        data_2015_2020.append(Dict)
-                    data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
+                    if (isinstance(Dict, dict) and int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021):
+                        data_newest.append(Dict)
+    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
     # Transfer
-    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
     for index in range(0, 4):
         with open(output_files[index], 'w', encoding='utf-8') as file:
             json.dump(Data[index], file, indent=4)

View File

@@ -94,10 +94,16 @@ def article_detail(Data, URL, article_id, Aricle_list):
         del Author[-1]
     # Submit_datetime and publish_datetime
+    def timeSet(time):
+        time = time.split('-')
+        time[1] = time[1].strip('0')
+        time = time[0] + '-' + time[1] + '-' + time[2]
+        return time
     time = Data.find('td', attrs={'align': 'right', 'width': '50%'})
     time = re.findall(r'\d+-\d+-\d+', str(time))
-    Submit_date = time[0] if time[0] else None
-    Publish_date = time[1] if time[1] else None
+    Submit_date = timeSet(time[0]) if time[0] else None
+    Publish_date = timeSet(time[1]) if time[1] else None
     # Keyword
     Keyword = Data.find('keywords').get_text() if Data.find('keywords') is not None else None
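
The new timeSet helper trims the zero padding from the month field of the scraped 'YYYY-MM-DD' dates before they are stored. A quick standalone check of its behaviour; note that str.strip('0') removes zeros from both ends of the month, so a month of '10' also loses its trailing zero (lstrip('0') would avoid that):

def timeSet(time):
    # split 'YYYY-MM-DD' and drop the zero padding from the month part
    time = time.split('-')
    time[1] = time[1].strip('0')
    return time[0] + '-' + time[1] + '-' + time[2]

print(timeSet('2023-08-01'))  # 2023-8-01
print(timeSet('2023-10-05'))  # 2023-1-05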

View File

@@ -22,10 +22,6 @@ from urllib.parse import urljoin
 Webs = []  # A list for web url
 Links = []  # A list for links
-# 存放爬取数据的空列表
-Article_data = []
-Author_data = []
 # ==========访问论文列表页==========
 # 获取数学类期刊网站链接
 headers = SD_link.header()
@@ -60,7 +56,7 @@ for web in Webs:
 print('\nThe links have been stored!\n')
 # 进入多线程池开始爬取
-SD_threads.Threads(Links, Article_data, Author_data)
+SD_threads.Threads(Links)
 # json文件汇总
 SD_save.Transf()

View File

@@ -32,31 +32,34 @@ def Transf():
             data = json.load(file)
             for Dict in data:
-                if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
+                if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
                     # 筛选文章
-                    data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
+                    if (isinstance(Dict, dict) and int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009):
+                        data_oldest.append(Dict)
-                    data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
-                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
+                    if (isinstance(Dict, dict) and 2010 <= int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014):
+                        data_2010_2014.append(Dict)
-                    data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
-                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
+                    if (isinstance(Dict, dict) and 2015 <= int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020):
+                        data_2015_2020.append(Dict)
-                    data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
+                    if (isinstance(Dict, dict) and int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021):
+                        data_newest.append(Dict)
+    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
     # 转存
-    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
     for index in range(0, 4):
         with open(output_files[index], 'w', encoding='utf-8') as file:
             json.dump(Data[index], file, indent=4)
     # 读取路径
-    author_folder_path = './SpringerOpen_buffer/Author'
-    article_folder_path = './SpringerOpen_buffer/Article'
+    author_folder_path = './SpringerOpen_buffer/Author_TS'
+    article_folder_path = './SpringerOpen_buffer/Article_TS'
     # 存储路径
     author_output_file = [
@@ -80,6 +83,7 @@ def Transf():
     # End
     print("\nData has been written into files.")
# 删除暂存区文件
 def delete(folder_path):
     file_names = os.listdir(folder_path)

View File

@@ -8,7 +8,11 @@ import SD_detail
 import SD_save
 # ==========获取论文详情页链接==========
-def Scrawl(Link, Article_data, Author_data):
+def Scrawl(Link):
+    # 存放爬取数据的空列表
+    Article_data = []
+    Author_data = []
     # 访问论文列表页
     headers = SD_link.header()
     soup = SD_link.Link(Link, headers)
@@ -35,6 +39,8 @@ def Scrawl(Link, Article_data, Author_data):
         Article_data.append(SD_detail.Article_dict(sub_soup, sub_Link, article_id))
         Author_data = SD_detail.Author_dict(sub_soup, article_id, Author_data)
+    print('Complete: ', Link)
     # 放入json文件暂存小文件
     if Article_data:
         index = str(uuid.uuid4())

View File

@ -1,24 +1,19 @@
from concurrent.futures import ThreadPoolExecutor, as_completed, wait from concurrent.futures import ThreadPoolExecutor, as_completed, wait
import SD_scrawl import SD_scrawl
# ==========多线程处理========== # ==========多线程处理==========
def Threads(Links, Article_data, Author_data): def Threads(Links):
executor = ThreadPoolExecutor(max_workers=20) # 进程池 executor = ThreadPoolExecutor(max_workers=25) # 进程池
# 进行多线程处理 # 进行多线程处理
futures = [executor.submit(SD_scrawl.Scrawl, Link, Article_data, Author_data) for Link in Links] futures = [executor.submit(SD_scrawl.Scrawl, Link) for Link in Links]
# max_iterations = 5 # 最多同时进行数
# iteration_count = 0 # 计数器
# 等待所有进程完成 # 等待所有进程完成
for future in as_completed(futures): for future in as_completed(futures):
try: try:
future.result() future.result()
# # 限制最大同时爬取数
# iteration_count += 1 # Increment the counter
# if iteration_count >= max_iterations:
# break
except Exception as e: except Exception as e:
print("An error occurred:", str(e)) print("An error occurred:", str(e))