diff --git a/projecteuclid_spider b/projecteuclid_spider
deleted file mode 100644
index 9ca21b1..0000000
--- a/projecteuclid_spider
+++ /dev/null
@@ -1,146 +0,0 @@
-import requests
-from bs4 import BeautifulSoup
-import json
-import re
-import uuid
-
-# Issue pages to crawl (volumes 32-36 of Differential and Integral Equations).
-main_page_urls = [
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-11_2f_12",
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-9_2f_10",
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-7_2f_8",
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-5_2f_6",
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-3_2f_4",
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-1_2f_2",
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-11_2f_12",
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-9_2f_10",
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-7_2f_8",
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-5_2f_6",
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-3_2f_4",
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-1_2f_2",
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-11_2f_12",
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-9_2f_10",
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-7_2f_8",
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-5_2f_6",
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-3_2f_4",
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-1_2f_2",
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-11_2f_12",
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-9_2f_10",
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-7_2f_8",
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-5_2f_6",
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-3_2f_4",
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-1_2f_2",
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-11_2f_12",
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-9_2f_10",
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-7_2f_8",
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-5_2f_6",
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-3_2f_4",
-    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-1_2f_2"
-]
-
-all_d_list = []
-
-# Walk each issue page and collect links to its article pages.
-for main_page_url in main_page_urls:
-    response = requests.get(main_page_url)
-    html = response.text
-    soup = BeautifulSoup(html, "html.parser")
-
-    pattern = re.compile(r'^/journals/differential-and-integral-equations/')
-    links = soup.find_all("a", href=pattern)
-    sub_urls = [link["href"] for link in links if link["href"].endswith(".full")]
-
-    # Visit each article page and extract its metadata.
-    for sub_url in sub_urls:
-        full_sub_url = "https://projecteuclid.org" + sub_url
-        sub_response = requests.get(full_sub_url)
-        sub_html = sub_response.text
-        sub_soup = BeautifulSoup(sub_html, "html.parser")
-
-        # Authors
-        author_tags = sub_soup.find_all('meta', {'name': 'citation_author'})
-        authors = {}  # author position -> author name
-        for i, tag in enumerate(author_tags, 1):
-            citation_author = tag['content']
-            authors[i] = citation_author if citation_author else None
-
-        # Title
-        titles = []
-        for title in sub_soup.find_all('meta', {'name': 'citation_title'}):
-            if title.get('content') is not None:
-                titles.append(title.get('content'))
-
-        # Publication date
-        publish_times = []
-        for publish_time in sub_soup.find_all('meta', {'name': 'publish_date'}):
-            if publish_time.get('content'):
-                publish_times.append(str(publish_time.get('content')))
-            else:
-                publish_times.append('None')
-
-        # Keywords
-        keywords_list = []
-        for keywords in sub_soup.find_all('meta', {'name': 'citation_keywords'}):
-            if keywords.get('content'):
-                keywords_list.append(keywords.get('content'))
-            else:
-                keywords_list.append('None')
-
-        # DOI
-        dois = []
-        for doi in sub_soup.find_all('meta', {'name': 'citation_doi'}):
-            dois.append(str(doi.get('content')))
-        doi_separated = ";\n".join(dois)
-
-        # Volume
-        volumes = []
-        for volume in sub_soup.find_all('meta', {'name': 'citation_volume'}):
-            if volume.get('content'):
-                volumes.append(volume.get('content'))
-            else:
-                volumes.append('None')
-        volume_separated = ";\n".join(volumes)
-
-        # Issue
-        issues = []
-        for issue in sub_soup.find_all('meta', {'name': 'citation_issue'}):
-            issues.append(issue.get('content') or 'None')
-        issue_separated = ";\n".join(issues)
-
-        # First page
-        firstpages = []
-        for firstpage in sub_soup.find_all('meta', {'name': 'citation_firstpage'}):
-            firstpages.append(str(firstpage.get('content')))
-
-        # Last page
-        lastpages = []
-        for lastpage in sub_soup.find_all('meta', {'name': 'citation_lastpage'}):
-            lastpages.append(str(lastpage.get('content')))
-
-        # MSC subject classification codes
-        MSC = []
-        for msc in sub_soup.find_all('meta', {'name': 'dc.Subject'}):
-            MSC.append(msc.get('content') or 'None')
-        MSC_separated = ";\n".join(MSC)
-
-        all_d = {"article_id": str(uuid.uuid4()),
-                 "Author": authors,
-                 "correspond_author": "null",
-                 "Title": titles,
-                 "Publish Time": publish_times,
-                 "keywords": keywords_list,
-                 "DOI": doi_separated,
-                 "volume": volume_separated,
-                 "issue": issue_separated,
-                 "url": full_sub_url,
-                 "page": "-".join(firstpages) + "-" + "-".join(lastpages),
-                 "journal": "projecteuclid.org",
-                 "MSC": MSC_separated}
-
-        all_d_list.append(all_d)
-
-# Dump all collected records to a JSON file.
-with open('articles.json', 'w') as f:
-    json.dump(all_d_list, f, indent=2)
-
-print("articles.json written successfully.")
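The deleted spider repeats the same find_all / guard / join pattern for every citation_* meta field. Below is a minimal sketch of that pattern factored into a single helper; the name get_meta_contents and the inline sample HTML are hypothetical, not part of the original script.

from bs4 import BeautifulSoup

def get_meta_contents(soup, name, default="None"):
    # Collect the content attribute of every <meta name=...> tag,
    # substituting `default` when the attribute is missing or empty,
    # mirroring the per-field guards in the deleted spider.
    return [tag.get("content") or default
            for tag in soup.find_all("meta", {"name": name})]

# Inline sample standing in for a fetched article page (hypothetical data).
sample = BeautifulSoup(
    '<meta name="citation_author" content="A. Author">'
    '<meta name="citation_author" content="B. Author">'
    '<meta name="citation_doi">',
    "html.parser")

print(get_meta_contents(sample, "citation_author"))  # ['A. Author', 'B. Author']
print(get_meta_contents(sample, "citation_doi"))     # ['None']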