删除文件
该文件已经移动至其他文件夹ProjectEuclid_spider,并且本地已经备份原文件 Signed-off-by: XCX <xcx@jack@ecwuuuuu.com>
This commit is contained in:
parent
26fed37e17
commit
07c334a903
@ -1,168 +0,0 @@
|
|||||||
import requests
|
|
||||||
from bs4 import BeautifulSoup,Tag
|
|
||||||
import json
|
|
||||||
import re
|
|
||||||
import uuid
|
|
||||||
|
|
||||||
# Root of the Project Euclid site; article links found on issue pages are
# site-relative and get this prefix.
BASE_URL = "https://projecteuclid.org"

# Issue pages to crawl: volumes 36 down to 32 of "Differential and Integral
# Equations", six double issues per volume, highest issue pair first
# (11/12, 9/10, ..., 1/2).  The "_2f_" between the two issue numbers is
# presumably the site's encoding of "/" -- it is reproduced verbatim.
main_page_urls = [
    "{}/journals/differential-and-integral-equations/volume-{}/issue-{}_2f_{}".format(
        BASE_URL, volume, first_issue, first_issue + 1
    )
    for volume in range(36, 31, -1)
    for first_issue in range(11, 0, -2)
]

# Matches site-relative links that point into this journal.  Compiled once
# here instead of once per issue page.
ARTICLE_LINK_PATTERN = re.compile(r'^/journals/differential-and-integral-equations/')

# Seconds to wait on a request before giving up; without a timeout a stalled
# connection would block the crawl indefinitely.
REQUEST_TIMEOUT = 30

# Placeholder text stored when a meta tag carries no usable content.
MISSING = 'None'

# One dict per scraped article, filled by main() and dumped to articles.json.
all_d_list = []


def meta_contents(tags, blank_is_missing=False):
    """Return the ``content`` attribute of every tag as a list of strings.

    Args:
        tags: Iterable of objects exposing ``.get('content')`` (the meta
            tags returned by ``find_all``).
        blank_is_missing: When true, an empty ``content`` is replaced by the
            ``'None'`` placeholder as well; otherwise only an absent
            attribute is.

    Returns:
        A list with one string per tag, in the order given.  Every element
        is a ``str`` so the result can be passed to ``str.join`` safely.
    """
    values = []
    for tag in tags:
        content = tag.get('content')
        if content is None or (blank_is_missing and not content):
            values.append(MISSING)
        else:
            values.append(str(content))
    return values


def extract_article(sub_soup, full_sub_url):
    """Build the record for one article from its parsed ``.full`` page.

    Args:
        sub_soup: Parsed article page; only ``find_all`` is used, to look up
            ``<meta name=...>`` tags.
        full_sub_url: Absolute URL the page was fetched from, stored as-is.

    Returns:
        A JSON-serialisable dict describing the article.  ``article_id:`` is
        a fresh random UUID on every call.
    """

    def metas(name):
        return sub_soup.find_all('meta', {'name': name})

    # Authors, numbered from 1 in page order.  A missing or empty content
    # attribute is stored as None (``.get`` avoids a KeyError on tags that
    # have no content attribute at all).
    authors = {
        position: (tag.get('content') or None)
        for position, tag in enumerate(metas('citation_author'), 1)
    }

    # Titles: tags without a content attribute are skipped entirely.
    titles = [
        tag.get('content')
        for tag in metas('citation_title')
        if tag.get('content') is not None
    ]

    # Publication dates, keywords and volumes: blank values become 'None'.
    publish_times = meta_contents(metas('publish_date'), blank_is_missing=True)
    keywords_list = meta_contents(metas('citation_keywords'), blank_is_missing=True)
    volumes = meta_contents(metas('citation_volume'), blank_is_missing=True)

    # Remaining fields are joined into strings below, so a missing content
    # attribute must not leak through as None (str.join would raise).
    dois = meta_contents(metas('citation_doi'))
    issues = meta_contents(metas('citation_issue'))
    firstpages = meta_contents(metas('citation_firstpage'))
    lastpages = meta_contents(metas('citation_lastpage'))
    msc_codes = meta_contents(metas('dc.Subject'))

    return {
        # NOTE(review): the trailing colon in this key looks like a typo, but
        # it is part of the emitted JSON schema, so it is left unchanged.
        "article_id:": str(uuid.uuid4()),
        "Author": authors,
        "correspond_author": "null",
        "Title": titles,
        "Publish Time": publish_times,
        "keywords": keywords_list,
        "DOI": ";\n".join(dois),
        "volume": ";\n".join(volumes),
        "issue": ";\n".join(issues),
        "url": full_sub_url,
        "page": "-".join(firstpages) + "-" + "-".join(lastpages),
        "journal": "projecteuclid.org",
        "MSC": ";\n".join(msc_codes),
    }


def main():
    """Crawl every issue page, scrape each article and write articles.json."""
    # Walk through every issue page.
    for main_page_url in main_page_urls:
        response = requests.get(main_page_url, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, "html.parser")

        # Keep only the journal links that end in ".full".
        links = soup.find_all("a", href=ARTICLE_LINK_PATTERN)
        sub_urls = [link["href"] for link in links if link["href"].endswith(".full")]

        # Visit each article page and extract its metadata.
        for sub_url in sub_urls:
            full_sub_url = BASE_URL + sub_url
            sub_response = requests.get(full_sub_url, timeout=REQUEST_TIMEOUT)
            sub_soup = BeautifulSoup(sub_response.text, "html.parser")
            all_d_list.append(extract_article(sub_soup, full_sub_url))

    # Write all collected records to the JSON file.
    with open('articles.json', 'w', encoding='utf-8') as f:
        json.dump(all_d_list, f, indent=2)

    print("JSON文件已成功生成。")


if __name__ == "__main__":
    main()
|
|
||||||
|
|
||||||
Loading…
x
Reference in New Issue
Block a user