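"""Scrape article metadata from Project Euclid issue pages.

For each issue of "Differential and Integral Equations" listed in
``main_page_urls``, follow every article link ending in ``.full``, read the
citation ``<meta>`` tags (authors, title, publish date, keywords, DOI,
volume, issue, pages, MSC subjects) and write all records to
``articles.json``.
"""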
import json
import re
import sys
import uuid

import requests
from bs4 import BeautifulSoup, Tag
# Issue landing pages to scrape, newest first: volumes 36 down to 32, and
# within each volume the double issues 11/12, 9/10, 7/8, 5/6, 3/4, 1/2.
# "_2f_" is how the site spells the "/" of a double issue in the URL path.
_ISSUE_URL_TEMPLATE = (
    "https://projecteuclid.org/journals/differential-and-integral-equations/"
    "volume-{volume}/issue-{first}_2f_{second}"
)

main_page_urls = [
    _ISSUE_URL_TEMPLATE.format(volume=volume, first=first, second=first + 1)
    for volume in range(36, 31, -1)
    for first in range(11, 0, -2)
]
# Placeholder recorded when a <meta> tag exists but carries no usable content.
_MISSING = "None"
# Seconds to wait per HTTP request; requests never times out unless told to.
_REQUEST_TIMEOUT = 30
_SITE_ROOT = "https://projecteuclid.org"
# Compiled once instead of on every issue page: site-relative journal links.
_JOURNAL_HREF = re.compile(r'^/journals/differential-and-integral-equations/')


def _fetch_soup(url):
    """Download *url* and return it parsed, or None when the request fails.

    Failures (connection errors, timeouts, non-2xx statuses) are reported on
    stderr so one bad page does not abort the whole run.
    """
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Skipping {url}: {exc}", file=sys.stderr)
        return None
    return BeautifulSoup(response.text, "html.parser")


def _meta_contents(soup, name):
    """Return the ``content`` of every ``<meta name=name>`` tag as strings.

    A missing or empty ``content`` becomes the ``'None'`` placeholder, so the
    result is always safe to pass to ``str.join``.
    """
    return [
        str(tag.get("content") or _MISSING)
        for tag in soup.find_all("meta", {"name": name})
    ]


def _extract_article(sub_soup, full_sub_url):
    """Build one article record from the citation <meta> tags of its page."""
    # Authors are keyed by their 1-based position in the page.
    authors = {
        position: (tag.get("content") or None)
        for position, tag in enumerate(
            sub_soup.find_all("meta", {"name": "citation_author"}), 1
        )
    }

    # Titles without a content attribute are dropped rather than replaced.
    titles = [
        tag.get("content")
        for tag in sub_soup.find_all("meta", {"name": "citation_title"})
        if tag.get("content") is not None
    ]

    first_pages = _meta_contents(sub_soup, "citation_firstpage")
    last_pages = _meta_contents(sub_soup, "citation_lastpage")

    return {
        # NOTE(review): the trailing colon in this key looks like a typo, but
        # it is kept as-is because consumers of articles.json may rely on it.
        "article_id:": str(uuid.uuid4()),
        "Author": authors,
        "correspond_author": "null",
        "Title": titles,
        "Publish Time": _meta_contents(sub_soup, "publish_date"),
        "keywords": _meta_contents(sub_soup, "citation_keywords"),
        "DOI": ";\n".join(_meta_contents(sub_soup, "citation_doi")),
        "volume": ";\n".join(_meta_contents(sub_soup, "citation_volume")),
        "issue": ";\n".join(_meta_contents(sub_soup, "citation_issue")),
        "url": full_sub_url,
        "page": "-".join(first_pages) + "-" + "-".join(last_pages),
        "journal": "projecteuclid.org",
        "MSC": ";\n".join(_meta_contents(sub_soup, "dc.Subject")),
    }


# One record (dict) per scraped article, in crawl order.
all_d_list = []

# Walk every issue page, then every article page it links to.
for main_page_url in main_page_urls:
    issue_soup = _fetch_soup(main_page_url)
    if issue_soup is None:
        continue

    article_hrefs = [
        link["href"]
        for link in issue_soup.find_all("a", href=_JOURNAL_HREF)
        if link["href"].endswith(".full")
    ]

    # dict.fromkeys drops repeated links while keeping their order, so an
    # article linked more than once is fetched and recorded only once.
    for sub_url in dict.fromkeys(article_hrefs):
        full_sub_url = _SITE_ROOT + sub_url
        sub_soup = _fetch_soup(full_sub_url)
        if sub_soup is None:
            continue
        all_d_list.append(_extract_article(sub_soup, full_sub_url))
# Write every collected record to disk once, after all pages were processed.
# json.dump keeps its default ASCII escaping; the encoding is set explicitly
# so the file does not depend on the platform's default.
with open('articles.json', 'w', encoding='utf-8') as f:
    json.dump(all_d_list, f, indent=2)

# User-facing confirmation (Chinese: "JSON file generated successfully.").
print("JSON文件已成功生成。")