Corrected the loops; the program will no longer add the same data repeatedly.
This commit is contained in:
parent
01c1a7d978
commit
2fc3b85bab
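In outline, the change replaces list comprehensions over the whole dataset (re-evaluated on every loop iteration, so each matching record was appended once per pass) with append calls guarded by the year test, so each record is added exactly once. A minimal sketch of that bucketing pattern with hypothetical records; the 'volume' / affiliation 'year' fields mirror the files changed below:

```python
import json

# Hypothetical input: a list of record dicts, as loaded from one staged JSON file.
data = json.loads('[{"volume": 2008}, {"volume": 2012, "affiliation": [{"year": 2012}]}, {"volume": 2023}]')

data_oldest, data_2010_2014, data_2015_2020, data_newest = [], [], [], []

for record in data:
    if not isinstance(record, dict):
        continue
    # Prefer 'volume'; fall back to the first affiliation's 'year'.
    year = int(record.get('volume') or record.get('affiliation', [{}])[0].get('year', 0))
    if year <= 2009:
        data_oldest.append(record)        # appended once, not once per loop pass
    elif year <= 2014:
        data_2010_2014.append(record)
    elif year <= 2020:
        data_2015_2020.append(record)
    else:
        data_newest.append(record)

print(len(data_oldest), len(data_2010_2014), len(data_2015_2020), len(data_newest))  # 1 1 0 1
```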
@@ -31,27 +31,31 @@ def Transf():
with open(file_path, 'r', encoding='utf-8') as file:
data = json.load(file)

for Dict in data:
if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
# Select data
data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
for Dict in data:
if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
# Select data
if (isinstance(Dict, dict) and int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009):
data_oldest.append(Dict)

data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
if (isinstance(Dict, dict) and 2010 <= int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014):
data_2010_2014.append(Dict)

data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
if (isinstance(Dict, dict) and 2015 <= int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020):
data_2015_2020.append(Dict)

data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
if (isinstance(Dict, dict) and int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021):
data_newest.append(Dict)

Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
# Transfer
Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]

# Transfer
for index in range(0, 4):
with open(output_files[index], 'w', encoding='utf-8') as file:
json.dump(Data[index], file, indent=4)
for index in range(0, 4):
with open(output_files[index], 'w', encoding='utf-8') as file:
json.dump(Data[index], file, indent=4)

# The path of reading
author_folder_path = './ejde_buffer/Author_TS'
@@ -15,10 +15,10 @@ from urllib.parse import urljoin
Crawled site: 'https://www.math.u-szeged.hu/ejqtde'

==========Run order==========
1. ejqtde_main: fetch the journal links for each year
2. ejqtde_scrawler: scrape each paper's details and author information -> calls ejqtde_save -> stores them in small temporary JSON files
3. ejqtde_save: read the staged small files locally, filter them, and write them into the large per-year files
*4. ejqtde_save.delete() (optional): delete every file in the staging area (back up first)
1. ejqtde_main: fetch the journal links for each year
2. ejqtde_scrawler: scrape each paper's details and author information -> calls ejqtde_save -> stores them in small temporary JSON files
3. ejqtde_save: read the staged small files locally, filter them, and write them into the large per-year files
*4. ejqtde_save.delete() (optional): delete every file in the staging area (back up first)
'''
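A minimal driver sketch of that run order; only ejqtde_save.Transf(), ejqtde_save.delete(folder) and the './EJQTDE_buffer/Author_TS' path are visible in these diffs, everything else is assumed:

```python
# Hypothetical driver following the run order documented above.
import ejqtde_save

# 1.-2. Run ejqtde_main and ejqtde_scrawler first to stage the small JSON files.

# 3. Merge the staged files into the per-year output files.
ejqtde_save.Transf()

# 4. (optional) Clear the staging area once the merged files are backed up.
# ejqtde_save.delete('./EJQTDE_buffer/Author_TS')
```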
@@ -49,9 +49,10 @@ Article_list = []
hrefs = []

# Base web urls
baseWeb = 'https://www.math.u-szeged.hu/ejqtde/'
current_year = datetime.datetime.now().year
years = range(2009, 2011)  # years = range(2010, current_year + 1)

baseWeb = 'https://www.math.u-szeged.hu/ejqtde/'
url_list = ['https://www.math.u-szeged.hu/ejqtde/periodica.html?periodica=1&paramtipus_ertek=publications&param_ertek='
+ f'{year}' for year in years][::-1]
@@ -34,26 +34,30 @@ def Transf():
data = json.load(file)

for Dict in data:
if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
# Select data
data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
# Select the articles
if (isinstance(Dict, dict) and int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009):
data_oldest.append(Dict)

data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
if (isinstance(Dict, dict) and 2010 <= int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014):
data_2010_2014.append(Dict)

data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
if (isinstance(Dict, dict) and 2015 <= int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020):
data_2015_2020.append(Dict)

data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
if (isinstance(Dict, dict) and int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021):
data_newest.append(Dict)

Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
# Transfer
Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]

# Transfer
for index in range(0, 4):
with open(output_files[index], 'w', encoding='utf-8') as file:
json.dump(Data[index], file, indent=4)
for index in range(0, 4):
with open(output_files[index], 'w', encoding='utf-8') as file:
json.dump(Data[index], file, indent=4)

# The path of reading
author_folder_path = './EJQTDE_buffer/Author_TS'
@@ -94,10 +94,16 @@ def article_detail(Data, URL, article_id, Aricle_list):
del Author[-1]

# Submit_datetime and publish_datetime
def timeSet(time):
time = time.split('-')
time[1] = time[1].strip('0')
time = time[0] + '-' + time[1] + '-' + time[2]
return time

time = Data.find('td', attrs={'align': 'right', 'width': '50%'})
time = re.findall(r'\d+-\d+-\d+', str(time))
Submit_date = time[0] if time[0] else None
Publish_date = time[1] if time[1] else None
Submit_date = timeSet(time[0]) if time[0] else None
Publish_date = timeSet(time[1]) if time[1] else None

# Keyword
Keyword = Data.find('keywords').get_text() if Data.find('keywords') is not None else None
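For illustration, a standalone check of the new date helper with hypothetical values. Note that str.strip('0') trims zeros from both ends of the month field, so '05' becomes '5' while '10' becomes '1':

```python
def timeSet(time):
    # Same logic as the helper above: drop the zero padding from the month field.
    time = time.split('-')
    time[1] = time[1].strip('0')
    return time[0] + '-' + time[1] + '-' + time[2]

print(timeSet('2009-05-12'))  # 2009-5-12
print(timeSet('2009-10-12'))  # 2009-1-12 (both zeros of '10' are stripped)
```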
@@ -22,10 +22,6 @@ from urllib.parse import urljoin
Webs = []  # A list for web url
Links = []  # A list for links

# Empty lists for the scraped data
Article_data = []
Author_data = []

# ==========Visit the article list pages==========
# Get the links to the mathematics journal sites
headers = SD_link.header()
@@ -60,7 +56,7 @@ for web in Webs:
print('\nThe links have been stored!\n')

# Enter the thread pool and start crawling
SD_threads.Threads(Links, Article_data, Author_data)
SD_threads.Threads(Links)

# Merge the JSON files
SD_save.Transf()
@@ -32,31 +32,34 @@ def Transf():
data = json.load(file)

for Dict in data:
if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
# Select the articles
data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
if (isinstance(Dict, dict) and int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009):
data_oldest.append(Dict)

data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
if (isinstance(Dict, dict) and 2010 <= int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014):
data_2010_2014.append(Dict)

data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
if (isinstance(Dict, dict) and 2015 <= int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020):
data_2015_2020.append(Dict)

data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
if (isinstance(Dict, dict) and int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021):
data_newest.append(Dict)

Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
# Transfer
Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]

# Transfer
for index in range(0, 4):
with open(output_files[index], 'w', encoding='utf-8') as file:
json.dump(Data[index], file, indent=4)
for index in range(0, 4):
with open(output_files[index], 'w', encoding='utf-8') as file:
json.dump(Data[index], file, indent=4)

# The path of reading

author_folder_path = './SpringerOpen_buffer/Author'
article_folder_path = './SpringerOpen_buffer/Article'
# The path of reading
author_folder_path = './SpringerOpen_buffer/Author_TS'
article_folder_path = './SpringerOpen_buffer/Article_TS'

# The path of storage
author_output_file = [
@@ -80,6 +83,7 @@ def Transf():
# End
print("\nData has been written into files.")


# Delete the files in the staging area
def delete(folder_path):
file_names = os.listdir(folder_path)
@@ -8,7 +8,11 @@ import SD_detail
import SD_save

# ==========Get the links to the article detail pages==========
def Scrawl(Link, Article_data, Author_data):
def Scrawl(Link):
# Empty lists for the scraped data
Article_data = []
Author_data = []

# Visit the article list page
headers = SD_link.header()
soup = SD_link.Link(Link, headers)
@@ -35,6 +39,8 @@ def Scrawl(Link, Article_data, Author_data):
Article_data.append(SD_detail.Article_dict(sub_soup, sub_Link, article_id))
Author_data = SD_detail.Author_dict(sub_soup, article_id, Author_data)

print('Complete: ', Link)

# Stage in a small temporary JSON file
if Article_data:
index = str(uuid.uuid4())
@@ -1,24 +1,19 @@
from concurrent.futures import ThreadPoolExecutor, as_completed, wait
import SD_scrawl


# ==========Multithreaded processing==========
def Threads(Links, Article_data, Author_data):
executor = ThreadPoolExecutor(max_workers=20)  # Thread pool
def Threads(Links):
executor = ThreadPoolExecutor(max_workers=25)  # Thread pool

# Run the multithreaded processing
futures = [executor.submit(SD_scrawl.Scrawl, Link, Article_data, Author_data) for Link in Links]

# max_iterations = 5  # Maximum number of tasks running at the same time
# iteration_count = 0  # Counter
futures = [executor.submit(SD_scrawl.Scrawl, Link) for Link in Links]

# Wait for all tasks to finish
for future in as_completed(futures):
try:
future.result()
# # Limit the maximum number of concurrent crawls
# iteration_count += 1  # Increment the counter
# if iteration_count >= max_iterations:
# break

except Exception as e:
print("An error occurred:", str(e))
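In SD_scrawl and SD_threads, each worker now builds its own result lists instead of appending to lists shared across all threads. A minimal, hypothetical sketch of that pattern (not the repository's code):

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def scrape_one(link):
    # Local lists: nothing is shared between worker threads, so a record
    # can only be appended by the single call that produced it.
    article_data, author_data = [], []
    article_data.append({'link': link})  # placeholder for the real parsing
    return article_data, author_data

links = ['page-1', 'page-2', 'page-3']  # placeholder links
with ThreadPoolExecutor(max_workers=4) as executor:
    futures = [executor.submit(scrape_one, link) for link in links]
    for future in as_completed(futures):
        articles, authors = future.result()
        print(len(articles), len(authors))  # 1 0 per link, no duplicates
```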