"""Scrape article metadata from projecteuclid.org.

Walks the issue pages of "Differential and Integral Equations"
(volumes 32-36, issue pairs 1/2 .. 11/12), extracts citation metadata
from each article's <meta> tags, and writes the result to articles.json.
"""

import json
import re
import uuid

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://projecteuclid.org"
JOURNAL_PATH = "/journals/differential-and-integral-equations"

# Issue index pages: volumes 36 down to 32, issue pairs 11/12 down to 1/2.
# "_2f_" is the site's URL encoding of "/" in e.g. "issue-11/12".
MAIN_PAGE_URLS = [
    f"{BASE_URL}{JOURNAL_PATH}/volume-{vol}/issue-{lo}_2f_{lo + 1}"
    for vol in range(36, 31, -1)
    for lo in range(11, 0, -2)
]

# Matches article links belonging to this journal.
_ARTICLE_HREF = re.compile(r"^/journals/differential-and-integral-equations/")

# Seconds before giving up on a single HTTP request.
REQUEST_TIMEOUT = 30


def _fetch_soup(url):
    """GET *url* and return a parsed BeautifulSoup document.

    Raises requests.RequestException on network errors or non-2xx status.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")


def _meta_contents(soup, name, placeholder="None"):
    """Return the content of every <meta name=*name*> tag.

    Empty or missing content attributes are replaced by *placeholder*
    so downstream ";\\n".join(...) calls never see a real None.
    """
    values = []
    for tag in soup.find_all("meta", {"name": name}):
        content = tag.get("content")
        values.append(content if content else placeholder)
    return values


def collect_article_urls(issue_url):
    """Return absolute URLs of all full-text articles on one issue page."""
    soup = _fetch_soup(issue_url)
    return [
        BASE_URL + link["href"]
        for link in soup.find_all("a", href=_ARTICLE_HREF)
        if link["href"].endswith(".full")
    ]


def extract_article(article_url):
    """Extract one article's metadata dict from its .full page."""
    soup = _fetch_soup(article_url)

    # Authors keyed by 1-based position; missing content stored as None.
    authors = {
        i: (tag.get("content") or None)
        for i, tag in enumerate(
            soup.find_all("meta", {"name": "citation_author"}), 1
        )
    }

    # Titles: keep only tags that actually carry a content attribute.
    titles = [
        t for t in _meta_contents(soup, "citation_title") if t != "None"
    ]

    publish_times = _meta_contents(soup, "publish_date")
    keywords = _meta_contents(soup, "citation_keywords")
    dois = _meta_contents(soup, "citation_doi")
    volumes = _meta_contents(soup, "citation_volume")
    issues = _meta_contents(soup, "citation_issue")
    firstpages = _meta_contents(soup, "citation_firstpage")
    lastpages = _meta_contents(soup, "citation_lastpage")
    msc_codes = _meta_contents(soup, "dc.Subject")

    return {
        "article_id": str(uuid.uuid4()),
        "Author": authors,
        "correspond_author": "null",
        "Title": titles,
        "Publish Time": publish_times,
        "keywords": keywords,
        "DOI": ";\n".join(dois),
        "volume": ";\n".join(volumes),
        "issue": ";\n".join(issues),
        "url": article_url,
        "page": "-".join(firstpages) + "-" + "-".join(lastpages),
        "journal": "projecteuclid.org",
        "MSC": ";\n".join(msc_codes),
    }


def main():
    """Crawl every issue page, gather article records, dump articles.json."""
    records = []
    for issue_url in MAIN_PAGE_URLS:
        try:
            article_urls = collect_article_urls(issue_url)
        except requests.RequestException as exc:
            # Best-effort crawl: skip an unreachable issue page, keep going.
            print(f"skip {issue_url}: {exc}")
            continue
        for article_url in article_urls:
            try:
                records.append(extract_article(article_url))
            except requests.RequestException as exc:
                print(f"skip {article_url}: {exc}")

    with open("articles.json", "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps accented author names readable in the file.
        json.dump(records, f, indent=2, ensure_ascii=False)

    print("JSON文件已成功生成。")


if __name__ == "__main__":
    main()