Corrected the loops so the program no longer adds the same data repeatedly
parent 01c1a7d978
commit 2fc3b85bab
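For context, the duplication came from rebuilding each year bucket with a list comprehension over the whole data list on every pass of the `for Dict in data:` loop; the corrected code classifies only the current record and appends it once. A minimal standalone sketch of that pattern follows (the `records` sample is made up for illustration; the real code takes the year from `volume` or `affiliation[0]['year']` exactly as in the diff):

# Minimal sketch of the de-duplication fix; the sample records are hypothetical.
records = [
    {"volume": "2008"},
    {"volume": "2012"},
    {"affiliation": [{"year": 2023}]},
]

data_oldest, data_2010_2014, data_2015_2020, data_newest = [], [], [], []

for rec in records:
    # Classify only the current record, as the fixed Transf() does.
    year = int(rec.get("volume") or rec.get("affiliation", [{}])[0].get("year", 0))
    if year <= 2009:
        data_oldest.append(rec)
    elif 2010 <= year <= 2014:
        data_2010_2014.append(rec)
    elif 2015 <= year <= 2020:
        data_2015_2020.append(rec)
    else:
        data_newest.append(rec)

print(len(data_oldest), len(data_2010_2014), len(data_2015_2020), len(data_newest))
# -> 1 1 0 1, whereas the old per-record list comprehensions over the whole
#    list appended every matching record once per loop iteration.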
@@ -32,23 +32,27 @@ def Transf():
             data = json.load(file)
 
         for Dict in data:
-            if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
+            if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
                 # Select data
-                data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
+                if (isinstance(Dict, dict) and int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009):
+                    data_oldest.append(Dict)
 
-                data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
+                if (isinstance(Dict, dict) and 2010 <= int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014):
+                    data_2010_2014.append(Dict)
 
-                data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
+                if (isinstance(Dict, dict) and 2015 <= int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020):
+                    data_2015_2020.append(Dict)
 
-                data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
-    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
+                if (isinstance(Dict, dict) and int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021):
+                    data_newest.append(Dict)
 
     # Transfer
+    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
 
     for index in range(0, 4):
         with open(output_files[index], 'w', encoding='utf-8') as file:
             json.dump(Data[index], file, indent=4)

@@ -49,9 +49,10 @@ Article_list = []
 hrefs = []
 
 # Base web urls
-baseWeb = 'https://www.math.u-szeged.hu/ejqtde/'
 current_year = datetime.datetime.now().year
 years = range(2009, 2011) # years = range(2010, current_year + 1)
 
+baseWeb = 'https://www.math.u-szeged.hu/ejqtde/'
 url_list = ['https://www.math.u-szeged.hu/ejqtde/periodica.html?periodica=1&paramtipus_ertek=publications&param_ertek='
             + f'{year}' for year in years][::-1]
 

@@ -34,23 +34,27 @@ def Transf():
             data = json.load(file)
 
         for Dict in data:
-            if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
+            if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
-                # Select data
+                # Filter articles
-                data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
+                if (isinstance(Dict, dict) and int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009):
+                    data_oldest.append(Dict)
 
-                data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
+                if (isinstance(Dict, dict) and 2010 <= int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014):
+                    data_2010_2014.append(Dict)
 
-                data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
+                if (isinstance(Dict, dict) and 2015 <= int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020):
+                    data_2015_2020.append(Dict)
 
-                data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
-    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
+                if (isinstance(Dict, dict) and int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021):
+                    data_newest.append(Dict)
 
     # Transfer
+    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
 
     for index in range(0, 4):
         with open(output_files[index], 'w', encoding='utf-8') as file:
             json.dump(Data[index], file, indent=4)

@@ -94,10 +94,16 @@ def article_detail(Data, URL, article_id, Aricle_list):
     del Author[-1]
 
     # Submit_datetime and publish_datetime
+    def timeSet(time):
+        time = time.split('-')
+        time[1] = time[1].strip('0')
+        time = time[0] + '-' + time[1] + '-' + time[2]
+        return time
+
     time = Data.find('td', attrs={'align': 'right', 'width': '50%'})
     time = re.findall(r'\d+-\d+-\d+', str(time))
-    Submit_date = time[0] if time[0] else None
-    Publish_date = time[1] if time[1] else None
+    Submit_date = timeSet(time[0]) if time[0] else None
+    Publish_date = timeSet(time[1]) if time[1] else None
 
     # Keyword
     Keyword = Data.find('keywords').get_text() if Data.find('keywords') is not None else None

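The timeSet helper added above rebuilds a scraped YYYY-MM-DD string after stripping zeros from the month field. A small standalone sketch of the same logic (the sample dates are made up); note that str.strip('0') removes zeros from both ends of the field, so a month of '10' comes back as '1':

def timeSet(time):
    # Mirror of the helper in the diff: drop surrounding zeros in the month field.
    time = time.split('-')
    time[1] = time[1].strip('0')
    time = time[0] + '-' + time[1] + '-' + time[2]
    return time

print(timeSet('2023-04-18'))  # prints 2023-4-18
print(timeSet('2023-10-05'))  # prints 2023-1-05, the trailing zero of '10' is stripped too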
@@ -22,10 +22,6 @@ from urllib.parse import urljoin
 Webs = [] # A list for web url
 Links = [] # A list for links
 
-# Empty lists for the scraped data
-Article_data = []
-Author_data = []
-
 # ========== Visit the article list page ==========
 # Get the links of the mathematics journal sites
 headers = SD_link.header()

@@ -60,7 +56,7 @@ for web in Webs:
 print('\nThe links have been stored!\n')
 
 # Enter the thread pool and start scraping
-SD_threads.Threads(Links, Article_data, Author_data)
+SD_threads.Threads(Links)
 
 # Aggregate the json files
 SD_save.Transf()

@@ -32,31 +32,34 @@ def Transf():
             data = json.load(file)
 
         for Dict in data:
-            if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
+            if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
                 # Filter articles
-                data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
+                if (isinstance(Dict, dict) and int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009):
+                    data_oldest.append(Dict)
 
-                data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
+                if (isinstance(Dict, dict) and 2010 <= int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014):
+                    data_2010_2014.append(Dict)
 
-                data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
+                if (isinstance(Dict, dict) and 2015 <= int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020):
+                    data_2015_2020.append(Dict)
 
-                data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
-    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
+                if (isinstance(Dict, dict) and int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021):
+                    data_newest.append(Dict)
 
     # Transfer
+    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
 
     for index in range(0, 4):
         with open(output_files[index], 'w', encoding='utf-8') as file:
             json.dump(Data[index], file, indent=4)
 
     # Paths to read from
-    author_folder_path = './SpringerOpen_buffer/Author'
-    article_folder_path = './SpringerOpen_buffer/Article'
+    author_folder_path = './SpringerOpen_buffer/Author_TS'
+    article_folder_path = './SpringerOpen_buffer/Article_TS'
 
     # Paths to write to
     author_output_file = [

@@ -80,6 +83,7 @@ def Transf():
     # End
     print("\nData has been written into files.")
 
+
 # Delete the buffer files
 def delete(folder_path):
     file_names = os.listdir(folder_path)

@@ -8,7 +8,11 @@ import SD_detail
 import SD_save
 
 # ========== Get the links of the article detail pages ==========
-def Scrawl(Link, Article_data, Author_data):
+def Scrawl(Link):
+    # Empty lists for the scraped data
+    Article_data = []
+    Author_data = []
+
     # Visit the article list page
     headers = SD_link.header()
     soup = SD_link.Link(Link, headers)

@@ -35,6 +39,8 @@ def Scrawl(Link, Article_data, Author_data):
         Article_data.append(SD_detail.Article_dict(sub_soup, sub_Link, article_id))
         Author_data = SD_detail.Author_dict(sub_soup, article_id, Author_data)
 
+    print('Complete: ', Link)
+
     # Stash into a json file for temporary storage (small files)
     if Article_data:
         index = str(uuid.uuid4())

@@ -1,24 +1,19 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed, wait
 import SD_scrawl
 
 
 # ========== Multithreaded processing ==========
-def Threads(Links, Article_data, Author_data):
-    executor = ThreadPoolExecutor(max_workers=20) # thread pool
+def Threads(Links):
+    executor = ThreadPoolExecutor(max_workers=25) # thread pool
 
     # Run the scraping in multiple threads
-    futures = [executor.submit(SD_scrawl.Scrawl, Link, Article_data, Author_data) for Link in Links]
+    futures = [executor.submit(SD_scrawl.Scrawl, Link) for Link in Links]
 
-    # max_iterations = 5 # maximum number running at once
-    # iteration_count = 0 # counter
 
     # Wait for all tasks to complete
     for future in as_completed(futures):
         try:
             future.result()
-            # # Limit the maximum number of simultaneous scrapes
-            # iteration_count += 1 # Increment the counter
-            # if iteration_count >= max_iterations:
-            # break
         except Exception as e:
             print("An error occurred:", str(e))

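The last two hunks move the per-link result lists inside Scrawl, so the pool only needs the link itself and the worker count rises from 20 to 25. A minimal, self-contained sketch of that submit/as_completed pattern (the fetch function and URLs below are placeholders, not the project's real SD_scrawl.Scrawl):

from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch(link):
    # Placeholder for SD_scrawl.Scrawl(link): each call owns its own result list.
    article_data = [f"article scraped from {link}"]
    return article_data

links = ["https://example.org/a", "https://example.org/b"]

executor = ThreadPoolExecutor(max_workers=25)
futures = [executor.submit(fetch, link) for link in links]

# Wait for every task and surface any exception, as the diff does.
for future in as_completed(futures):
    try:
        print(future.result())
    except Exception as e:
        print("An error occurred:", str(e))

executor.shutdown()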