import requests
from bs4 import BeautifulSoup
import json
import re
import uuid
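# Assumption (not in the original script): `time` is imported so the loop
# below can pause briefly between article requests.
import time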
main_page_urls = [
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-11_2f_12",
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-9_2f_10",
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-7_2f_8",
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-5_2f_6",
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-3_2f_4",
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-1_2f_2",
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-11_2f_12",
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-9_2f_10",
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-7_2f_8",
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-5_2f_6",
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-3_2f_4",
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-1_2f_2",
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-11_2f_12",
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-9_2f_10",
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-7_2f_8",
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-5_2f_6",
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-3_2f_4",
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-1_2f_2",
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-11_2f_12",
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-9_2f_10",
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-7_2f_8",
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-5_2f_6",
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-3_2f_4",
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-1_2f_2",
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-11_2f_12",
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-9_2f_10",
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-7_2f_8",
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-5_2f_6",
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-3_2f_4",
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-1_2f_2"
]
all_d_list = []
# Iterate over the issue index pages
for main_page_url in main_page_urls:
    response = requests.get(main_page_url)
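    # Assumption (not in the original script): abort early on HTTP errors
    # instead of parsing an error page.
    response.raise_for_status()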
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    pattern = re.compile(r'^/journals/differential-and-integral-equations/')
    links = soup.find_all("a", href=pattern)
    sub_urls = [link["href"] for link in links if link["href"].endswith(".full")]
    # Visit each article page and extract its metadata
    for sub_url in sub_urls:
        full_sub_url = "https://projecteuclid.org" + sub_url
        sub_response = requests.get(full_sub_url)
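        # Assumption (not in the original script): likewise fail fast if an
        # article page cannot be fetched.
        sub_response.raise_for_status()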
        sub_html = sub_response.text
        # Parse the article page
        sub_soup = BeautifulSoup(sub_html, "html.parser")
        # Find the authors
        author_tags = sub_soup.find_all('meta', {'name': 'citation_author'})
        authors = {}  # maps author index -> name
        # Process each author's meta tag
        for i, tag in enumerate(author_tags, 1):
            citation_author = tag['content']
            authors[i] = citation_author if citation_author else None
        # Find the article title
        titles = []
        for title in sub_soup.find_all('meta', {'name': 'citation_title'}):
            if title.get('content') is not None:
                titles.append(title.get('content'))
        # Find the publication date
        publish_times = []
        for publish_time in sub_soup.find_all('meta', {'name': 'publish_date'}):
            if publish_time.get('content'):
                publish_times.append(str(publish_time.get('content')))
            else:
                publish_times.append('None')
        # Find the keywords
        keywords_list = []
        for keywords in sub_soup.find_all('meta', {'name': 'citation_keywords'}):
            if keywords.get('content'):
                keywords_list.append(keywords.get('content'))
            else:
                keywords_list.append('None')
        # Find the DOI
        dois = []
        for doi in sub_soup.find_all('meta', {'name': 'citation_doi'}):
            dois.append(str(doi.get('content')))
        doi_separated = ";\n".join(dois)
        # Find the volume
        volumes = []
        for volume in sub_soup.find_all('meta', {'name': 'citation_volume'}):
            if volume.get('content'):
                volumes.append(volume.get('content'))
            else:
                volumes.append('None')
        volume_separated = ";\n".join(volumes)
        # Find the issue (fall back to 'None' so the join below never sees a None)
        issues = []
        for issue in sub_soup.find_all('meta', {'name': 'citation_issue'}):
            issues.append(issue.get('content') or 'None')
        issue_separated = ";\n".join(issues)
        # Find the first page
        firstpages = []
        for firstpage in sub_soup.find_all('meta', {'name': 'citation_firstpage'}):
            firstpages.append(firstpage.get('content') or 'None')
        # Find the last page
        lastpages = []
        for lastpage in sub_soup.find_all('meta', {'name': 'citation_lastpage'}):
            lastpages.append(lastpage.get('content') or 'None')
        # Find the MSC subject classifications
        MSC = []
        for msc in sub_soup.find_all('meta', {'name': 'dc.Subject'}):
            MSC.append(msc.get('content') or 'None')
        MSC_separated = ";\n".join(MSC)
        all_d = {"article_id": str(uuid.uuid4()),
                 "Author": authors,
                 "correspond_author": "null",
                 "Title": titles,
                 "Publish Time": publish_times,
                 "keywords": keywords_list,
                 "DOI": doi_separated,
                 "volume": volume_separated,
                 "issue": issue_separated,
                 "url": full_sub_url,
                 "page": "-".join(firstpages) + "-" + "-".join(lastpages),
                 "journal": "projecteuclid.org",
                 "MSC": MSC_separated}
        # Store this article's record
        all_d_list.append(all_d)
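        # Assumption (not in the original script): a short pause between
        # article requests to avoid hammering projecteuclid.org.
        time.sleep(1)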
# Write all collected records to a JSON file
with open('articles.json', 'w') as f:
    json.dump(all_d_list, f, indent=2)
print("JSON file generated successfully.")