Modified old code

This commit is contained in:
parent cfa9345a79
commit 26fed37e17
@@ -1,38 +0,0 @@
-import os
-import json
-
-# Function
-# Get the data from input files
-def Read(folder_path):
-    data = []
-
-    for filename in os.listdir(folder_path):
-        if filename.endswith('.json'):
-            file_path = os.path.join(folder_path, filename)
-            with open(file_path, 'r', encoding='utf-8') as file:
-                data.extend(json.load(file))
-    return data
-
-# Write into output files
-def Write(data, output_file):
-    with open(output_file, 'w', encoding='utf-8') as file:
-        json.dump(data, file, indent=4)
-
-# Path of files need to be read
-folder_path1 = '.\ejde_buffer\Author'
-folder_path2 = '.\ejde_buffer\Article'
-
-# Read the data in the files
-Author_data = Read(folder_path1)
-Article_data = Read(folder_path2)
-
-# The path of output files
-output_file1 = '.\ejde_buffer\Author_output_file.json'
-output_file2 = '.\ejde_buffer\Article_output_file.json'
-
-# Write into files
-Write(Author_data, output_file1)
-Write(Article_data, output_file2)
-
-# End
-print("\nData has been written into files.")
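The deleted script above merged every buffered JSON file into one output file per type; the commit replaces it with the year-bucketed Transf() in ejde_save.py below. For reference, a minimal defensive sketch of the same folder-merge pattern, assuming each buffered file holds a JSON list (the helper name and the malformed-file handling are illustrative, not part of the commit):

    import json
    import os

    def merge_json_folder(folder_path):
        # Collect records from every .json file in the folder; skip files
        # that do not parse or do not contain a list.
        merged = []
        for filename in sorted(os.listdir(folder_path)):
            if not filename.endswith('.json'):
                continue
            file_path = os.path.join(folder_path, filename)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    payload = json.load(f)
            except json.JSONDecodeError:
                continue  # ignore half-written or corrupt buffer files
            if isinstance(payload, list):
                merged.extend(payload)
        return merged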
@@ -1,23 +1,22 @@
-import os
 import uuid
 import requests
-from bs4 import BeautifulSoup
 import re
-import json
+import ejde_save
 
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from retrying import retry
+from bs4 import BeautifulSoup
 
+'''
+Site crawled: 'ejde.math.txstate.edu'
+
+========== Run order ==========
+1. ejde_main: fetch each year's journal links -> scrape every article's details and author details -> call ejde_save -> buffer the results in small JSON files
+2. ejde_save: scan the locally buffered small files, filter them, and write them into large files split by year range
+*3. ejde_save.delete() (optional): delete every file in the temporary storage area (back up first)
+'''
-def save_data(dataset, filetype, filename):
-    if dataset:
-        directory = "./ejde_buffer/" + filetype + "/"
-        os.makedirs(directory, exist_ok=True)
-        filepath = os.path.join(directory, filename)
-        with open(filepath, "w", encoding='utf-8') as json_file:
-            json.dump(dataset, json_file, indent=4)
-        print(filetype + " data have been added to", filepath)
-
 
+# Article and author detail
 @retry(wait_fixed=5000, stop_max_attempt_number=5)
 def process_article(url):
     response = requests.get(url)
@@ -43,7 +42,7 @@ def process_article(url):
 
     # Extract volume
     volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
-    volume = volume_match.group(1) if volume_match else None
+    volume = str(volume_match.group(1)) if volume_match else None
    # year = volume_match.group(2) if volume_match else None
 
     # Extract pp
@@ -141,11 +140,11 @@ def process_article(url):
 
    # Save the data periodically based on batch size
    if len(articleData) % batch_size == 0:
-        save_data(articleData, "Article", str(uuid.uuid4()) + ".json")
+        ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
        articleData.clear()
 
    if len(authorData) % batch_size == 0:
-        save_data(authorData, "Author", str(uuid.uuid4()) + ".json")
+        ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
        authorData.clear()
 
 
@@ -162,7 +161,7 @@ url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_l
 authorData = []
 articleData = []
 
-batch_size = 500  # Number of articles to process before saving
+batch_size = 5  # Number of articles to process before saving
 executor = ThreadPoolExecutor(max_workers=20)  # Set the number of worker threads
 
 # Process each URL using multithreading
@@ -176,10 +175,14 @@ for future in as_completed(futures):
        print("An error occurred:", str(e))
 
 # Save remaining data
-if articleData:
-    save_data(articleData, "Article", str(uuid.uuid4()) + ".json")
-    print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article/")
+if len(articleData) > 0:
+    ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
+    print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
 
-if authorData:
-    save_data(authorData, "Author", str(uuid.uuid4()) + ".json")
-    print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author/")
+if len(authorData) > 0:
+    ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
+    print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
+
+# Transfer to large file and delete the temporary storage files
+ejde_save.Transf()
+ejde_save.delete()
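The periodic flush above keys off len(articleData) % batch_size == 0, with both buffers shared across the worker threads. A minimal sketch of the same buffer-and-flush idea written behind a lock, so concurrent appends cannot race past the threshold; the BatchBuffer class and its parameters are illustrative, not part of this commit:

    import json
    import os
    import threading
    import uuid

    class BatchBuffer:
        # Accumulates records and flushes them to a uuid-named JSON file
        # once batch_size records have been collected.
        def __init__(self, directory, batch_size=500):
            self.directory = directory
            self.batch_size = batch_size
            self.records = []
            self.lock = threading.Lock()

        def add(self, record):
            with self.lock:
                self.records.append(record)
                if len(self.records) >= self.batch_size:
                    self._flush_locked()

        def _flush_locked(self):
            if not self.records:
                return
            os.makedirs(self.directory, exist_ok=True)
            filepath = os.path.join(self.directory, str(uuid.uuid4()) + ".json")
            with open(filepath, "w", encoding="utf-8") as f:
                json.dump(self.records, f, indent=4)
            self.records.clear()

        def flush(self):
            # Call once at the end of the run to persist any remainder.
            with self.lock:
                self._flush_locked()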
EJDE_spider/ejde_save.py  Normal file  (93 lines)
@@ -0,0 +1,93 @@
+import os
+import json
+
+
+# Save data
+def save_data(dataset, filetype, filename):
+    if dataset:
+        directory = "./ejde_buffer/" + filetype + "/"
+        os.makedirs(directory, exist_ok=True)
+        filepath = os.path.join(directory, filename)
+        with open(filepath, "w", encoding='utf-8') as json_file:
+            json.dump(dataset, json_file, indent=4)
+        print(filetype + " data have been added to", filepath)
+
+
+# Write into output files
+def Transf():
+    def Read(folder_path, output_files):
+        # Create new folders
+        os.makedirs('./ejde_buffer/Article_output/', exist_ok=True)
+        os.makedirs('./ejde_buffer/Author_output/', exist_ok=True)
+
+        data_oldest = []
+        data_2010_2014 = []
+        data_2015_2020 = []
+        data_newest = []
+
+        for filename in os.listdir(folder_path):
+            if filename.endswith('.json'):
+                file_path = os.path.join(folder_path, filename)
+                with open(file_path, 'r', encoding='utf-8') as file:
+                    data = json.load(file)
+
+                for Dict in data:
+                    if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
+                        # Select data
+                        data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
+
+                        data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
+
+                        data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
+
+                        data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
+
+        Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
+
+        # Transfer
+        for index in range(0, 4):
+            with open(output_files[index], 'w', encoding='utf-8') as file:
+                json.dump(Data[index], file, indent=4)
+
+    # The path of reading
+    author_folder_path = './ejde_buffer/Author_TS'
+    article_folder_path = './ejde_buffer/Article_TS'
+
+    # The path of storage
+    author_output_file = [
+        './ejde_buffer/Author_output/Author_output_file(oldest).json',
+        './ejde_buffer/Author_output/Author_output_file(2010-2014).json',
+        './ejde_buffer/Author_output/Author_output_file(2015-2020).json',
+        './ejde_buffer/Author_output/Author_output_file(newest).json'
+    ]
+
+    article_output_file = [
+        './ejde_buffer/Article_output/Article_output_file(oldest).json',
+        './ejde_buffer/Article_output/Article_output_file(2010-2014).json',
+        './ejde_buffer/Article_output/Article_output_file(2015-2020).json',
+        './ejde_buffer/Article_output/Article_output_file(newest).json'
+    ]
+
+    # Read and write into files
+    Read(author_folder_path, author_output_file)
+    Read(article_folder_path, article_output_file)
+
+    # End
+    print("\nData has been written into files.")
+
+
+# Delete files in temporary storage area
+def delete():
+    folder_paths = ['./ejde_buffer/Author_TS', './ejde_buffer/Article_TS']
+    for folder_path in folder_paths:
+        file_names = os.listdir(folder_path)
+        for file_name in file_names:
+            file_path = os.path.join(folder_path, file_name)
+            if os.path.isfile(file_path):
+                os.remove(file_path)
+
+    print('\nAttention: The temporary storage files have been deleted!')
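Transf() buckets every record by int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)): an article's volume field takes priority (EJDE volume numbers coincide with years), and an author record falls back to the year inside its first affiliation entry. A small sketch of that predicate on hypothetical records; the sample data and the bucket_year name are illustrative only:

    def bucket_year(record):
        # Same precedence as Transf(): 'volume' first, then the first
        # affiliation's 'year', then 0.
        return int(record.get('volume') or record.get('affiliation', [{}])[0].get('year', 0))

    article = {'volume': '2013'}                # bucketed by volume
    author = {'affiliation': [{'year': 2022}]}  # bucketed by affiliation year
    print(bucket_year(article))  # 2013 -> goes to the 2010-2014 output file
    print(bucket_year(author))   # 2022 -> goes to the newest output file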
@@ -12,10 +12,10 @@ from concurrent.futures import ThreadPoolExecutor, as_completed, wait
 from urllib.parse import urljoin
 
 '''
-Site crawled: 'https://www.math.u-szeged.hu/ejqtde
+Site crawled: 'https://www.math.u-szeged.hu/ejqtde'
 
 ========== Run order ==========
-1. ejqtde_href_multithread: fetch each year's journal links
+1. ejqtde_main: fetch each year's journal links
 2. ejqtde_scrawler: scrape every article's details and author details -> call ejqtde_save -> buffer the results in small JSON files
 3. ejqtde_save: scan the locally buffered small files, filter them, and write them into large files split by year range
 *4. ejqtde_save.delete() (optional): delete every file in the temporary storage area (back up first)
@@ -70,6 +70,7 @@ with ThreadPoolExecutor(max_workers=25) as executor:
     wait(futures)
 print('\nAll links have been got.\n')
 
+# Use multithreading to get the data
 count1 = 0
 count2 = 0
 locks = threading.Lock()
@@ -107,5 +108,6 @@ print('\nThe whole scrawler program has been done\n')
 print(count1, ' article_data has been stored.')
 print(count2, ' author_data has been stored.')
 
+# Transfer to large file and delete the temporary storage files
 ejqtde_save.Transf()
 ejqtde_save.delete()
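count1 and count2 above are shared across the scraper threads and guarded by locks = threading.Lock(). A minimal sketch of the increment pattern that keeps such counters consistent; the function name and its surroundings are illustrative, not from the commit:

    import threading

    count1 = 0
    locks = threading.Lock()

    def record_article_stored():
        # Increment the shared counter under the lock so two threads
        # cannot interleave the read-modify-write on the same value.
        global count1
        with locks:
            count1 += 1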
@@ -18,7 +18,7 @@ def save_data(dataset, filetype):
 # Summary files
 def Transf():
     def Read(folder_path, output_files):
-        # 新建文件夹
+        # Create new folder
         os.makedirs('./EJQTDE_buffer/Article_output/', exist_ok=True)
         os.makedirs('./EJQTDE_buffer/Author_output/', exist_ok=True)
 
@@ -33,6 +33,8 @@ def Transf():
         with open(file_path, 'r', encoding='utf-8') as file:
             data = json.load(file)
 
+        for Dict in data:
+            if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
                 # Select data
                 data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
                     Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
@@ -91,3 +93,4 @@ def delete():
                 os.remove(file_path)
 
+
     print('\nAttention: The temporary storage files have been deleted!')
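The guard added above admits a record once it has a usable volume or first-affiliation year. A hedged sketch of an equivalent check factored into a reusable helper that returns None instead of raising when neither field parses; the helper name and behavior are illustrative, not part of the commit:

    def extract_year(record):
        # Mirror the selection logic: prefer 'volume', fall back to the
        # first affiliation's 'year'; return None if neither is usable.
        if not isinstance(record, dict):
            return None
        affiliations = record.get('affiliation') or [{}]
        raw = record.get('volume') or affiliations[0].get('year')
        try:
            return int(raw)
        except (TypeError, ValueError):
            return None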
ProjectEuclid_spider/projecteuclid_main  Normal file  (168 lines)
@@ -0,0 +1,168 @@
+import requests
+from bs4 import BeautifulSoup, Tag
+import json
+import re
+import uuid
+
+main_page_urls = [
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-11_2f_12",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-9_2f_10",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-7_2f_8",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-5_2f_6",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-3_2f_4",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-1_2f_2",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-11_2f_12",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-9_2f_10",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-7_2f_8",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-5_2f_6",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-3_2f_4",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-1_2f_2",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-11_2f_12",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-9_2f_10",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-7_2f_8",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-5_2f_6",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-3_2f_4",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-1_2f_2",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-11_2f_12",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-9_2f_10",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-7_2f_8",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-5_2f_6",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-3_2f_4",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-1_2f_2",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-11_2f_12",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-9_2f_10",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-7_2f_8",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-5_2f_6",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-3_2f_4",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-1_2f_2"
+]
+
+all_d_list = []
+
+# Iterate over the links of the main pages
+for main_page_url in main_page_urls:
+    response = requests.get(main_page_url)
+    html = response.text
+    soup = BeautifulSoup(html, "html.parser")
+
+    pattern = re.compile(r'^/journals/differential-and-integral-equations/')
+    links = soup.find_all("a", href=pattern)
+    sub_urls = [link["href"] for link in links if link["href"].endswith(".full")]
+
+    # Visit the sub-links and extract their data
+    for sub_url in sub_urls:
+        full_sub_url = "https://projecteuclid.org" + sub_url
+        sub_response = requests.get(full_sub_url)
+        sub_html = sub_response.text
+
+        # Run the extraction on the sub-page
+        sub_soup = BeautifulSoup(sub_html, "html.parser")
+
+        # Find the authors
+        author_tags = sub_soup.find_all('meta', {'name': 'citation_author'})
+
+        authors = {}  # Dictionary for storing the author data
+
+        # Process each author entry
+        for i, tag in enumerate(author_tags, 1):
+            citation_author = tag['content']
+            authors[i] = citation_author if citation_author else None
+
+        # Find the article's basic information
+        titles = []
+        for title in sub_soup.find_all('meta', {'name': 'citation_title'}):
+            if title.get('content') is not None:
+                titles.append(title.get('content'))
+
+        # Find the publication date
+        publish_times = []
+        for publish_time in sub_soup.find_all('meta', {'name': 'publish_date'}):
+            if publish_time.get('content'):
+                publish_times.append(str(publish_time.get('content')))
+            else:
+                publish_times.append('None')
+
+        # Find the keywords
+        keywords_list = []
+        for keywords in sub_soup.find_all('meta', {'name': 'citation_keywords'}):
+            if keywords.get('content'):
+                keywords_list.append(keywords.get('content'))
+            else:
+                keywords_list.append('None')
+
+        # Find the DOI
+        dois = []
+        for doi in sub_soup.find_all('meta', {'name': 'citation_doi'}):
+            dois.append(str(doi.get('content')))
+        doi_separated = ";\n".join(dois)
+
+        # Find the volume
+        volumes = []
+        for volume in sub_soup.find_all('meta', {'name': 'citation_volume'}):
+            if volume.get('content'):
+                volumes.append(volume.get('content'))
+            else:
+                volumes.append('None')
+        volume_separated = ";\n".join(volumes)
+
+        # Find the issue
+        issues = []
+        for issue in sub_soup.find_all('meta', {'name': 'citation_issue'}):
+            issues.append(issue.get('content'))
+        issue_separated = ";\n".join(issues)
+
+        # Find the first page
+        firstpages = []
+        for firstpage in sub_soup.find_all('meta', {'name': 'citation_firstpage'}):
+            firstpages.append(firstpage.get('content'))
+
+        # Find the last page
+        lastpages = []
+        for lastpage in sub_soup.find_all('meta', {'name': 'citation_lastpage'}):
+            lastpages.append(lastpage.get('content'))
+
+        # Find the MSC codes
+        MSC = []
+        for msc in sub_soup.find_all('meta', {'name': 'dc.Subject'}):
+            MSC.append(msc.get('content'))
+        MSC_separated = ";\n".join(MSC)
+
+        all_d = {"article_id": str(uuid.uuid4()),
+                 "Author": authors,
+                 "correspond_author": "null",
+                 "Title": titles,
+                 "Publish Time": publish_times,
+                 "keywords": keywords_list,
+                 "DOI": doi_separated,
+                 "volume": volume_separated,
+                 "issue": issue_separated,
+                 "url": full_sub_url,
+                 "page": "-".join(firstpages) + "-" + "-".join(lastpages),
+                 "journal": "projecteuclid.org",
+                 "MSC": MSC_separated}
+        # print(all_d)
+
+        # Write into the JSON file
+        all_d_list.append(all_d)
+
+        # Store the data in the list
+        # all_d_list.append(...)
+
+        # Print the stored data
+        # print(all_d_list)
+
+with open('articles.json', 'w') as f:
+    json.dump(all_d_list, f, indent=2)
+
+print("The JSON file has been generated successfully.")
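Each field above repeats the same scan over <meta> tags. A compact sketch of that pattern as a single helper; the meta_contents name and the usage lines are illustrative rather than part of the commit:

    from bs4 import BeautifulSoup

    def meta_contents(soup, name, default='None'):
        # Collect the content attribute of every <meta name=...> tag,
        # substituting a default when the attribute is missing or empty.
        return [tag.get('content') or default
                for tag in soup.find_all('meta', {'name': name})]

    # Hypothetical usage against a parsed article page:
    # volumes = meta_contents(sub_soup, 'citation_volume')
    # volume_separated = ";\n".join(volumes)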
@@ -31,6 +31,8 @@ def Transf():
         with open(file_path, 'r', encoding='utf-8') as file:
             data = json.load(file)
 
+        for Dict in data:
+            if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
                 # Filter articles
                 data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
                     Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]