import json
import re
import time
import uuid

import requests
from bs4 import BeautifulSoup

main_page_urls = [
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-11_2f_12",
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-9_2f_10",
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-7_2f_8",
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-5_2f_6",
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-3_2f_4",
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-1_2f_2",
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-11_2f_12",
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-9_2f_10",
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-7_2f_8",
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-5_2f_6",
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-3_2f_4",
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-1_2f_2",
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-11_2f_12",
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-9_2f_10",
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-7_2f_8",
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-5_2f_6",
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-3_2f_4",
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-1_2f_2",
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-11_2f_12",
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-9_2f_10",
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-7_2f_8",
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-5_2f_6",
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-3_2f_4",
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-1_2f_2",
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-11_2f_12",
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-9_2f_10",
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-7_2f_8",
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-5_2f_6",
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-3_2f_4",
    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-1_2f_2"
]

all_d_list = []

# Iterate over the issue index pages
for main_page_url in main_page_urls:
    # timeout added so a stalled request cannot hang the whole scrape
    response = requests.get(main_page_url, timeout=30)
    html = response.text
    soup = BeautifulSoup(html, "html.parser")

    # Collect links to full-text article pages within this journal
    pattern = re.compile(r'^/journals/differential-and-integral-equations/')
    links = soup.find_all("a", href=pattern)
    sub_urls = [link["href"] for link in links if link["href"].endswith(".full")]

    # Visit each article page and extract its metadata
    for sub_url in sub_urls:
        full_sub_url = "https://projecteuclid.org" + sub_url
        sub_response = requests.get(full_sub_url, timeout=30)
        sub_html = sub_response.text
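        # A brief courtesy delay between article requests. This is an
        # assumption on my part: Project Euclid documents no rate limit for
        # these pages here, so one second is just a conservative guess.
        time.sleep(1)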
        # Parse the article page and pull the citation metadata out of its <meta> tags
        sub_soup = BeautifulSoup(sub_html, "html.parser")

        # Authors: map the 1-based author position to the author's name
        author_tags = sub_soup.find_all('meta', {'name': 'citation_author'})
        authors = {}
        for i, tag in enumerate(author_tags, 1):
            citation_author = tag.get('content')
            authors[i] = citation_author if citation_author else None

        # Title
        titles = []
        for title in sub_soup.find_all('meta', {'name': 'citation_title'}):
            if title.get('content') is not None:
                titles.append(title.get('content'))

        # Publish date (bug fix: the fallback previously appended to the loop
        # variable publish_time instead of the publish_times list)
        publish_times = []
        for publish_time in sub_soup.find_all('meta', {'name': 'publish_date'}):
            if publish_time.get('content'):
                publish_times.append(str(publish_time.get('content')))
            else:
                publish_times.append('None')

        # Keywords
        keywords_list = []
        for keywords in sub_soup.find_all('meta', {'name': 'citation_keywords'}):
            if keywords.get('content'):
                keywords_list.append(keywords.get('content'))
            else:
                keywords_list.append('None')

        # DOI
        dois = []
        for doi in sub_soup.find_all('meta', {'name': 'citation_doi'}):
            dois.append(str(doi.get('content')))
        doi_separated = ";\n".join(dois)

        # Volume
        volumes = []
        for volume in sub_soup.find_all('meta', {'name': 'citation_volume'}):
            if volume.get('content'):
                volumes.append(volume.get('content'))
            else:
                volumes.append('None')
        volume_separated = ";\n".join(volumes)

        # Issue (str() guards against a missing content attribute, which would
        # otherwise break the join with a TypeError)
        issues = []
        for issue in sub_soup.find_all('meta', {'name': 'citation_issue'}):
            issues.append(str(issue.get('content')))
        issue_separated = ";\n".join(issues)

        # First page
        firstpages = []
        for firstpage in sub_soup.find_all('meta', {'name': 'citation_firstpage'}):
            firstpages.append(str(firstpage.get('content')))

        # Last page
        lastpages = []
        for lastpage in sub_soup.find_all('meta', {'name': 'citation_lastpage'}):
            lastpages.append(str(lastpage.get('content')))

        # MSC subject classification
        MSC = []
        for msc in sub_soup.find_all('meta', {'name': 'dc.Subject'}):
            MSC.append(str(msc.get('content')))
        MSC_separated = ";\n".join(MSC)

        all_d = {
            "article_id": str(uuid.uuid4()),  # bug fix: key was "article_id:"
            "Author": authors,
            "correspond_author": "null",
            "Title": titles,
            "Publish Time": publish_times,
            "keywords": keywords_list,
            "DOI": doi_separated,
            "volume": volume_separated,
            "issue": issue_separated,
            "url": full_sub_url,
            "page": "-".join(firstpages) + "-" + "-".join(lastpages),  # e.g. "123-145"
            "journal": "projecteuclid.org",
            "MSC": MSC_separated,
        }

        # Store this article's record
        all_d_list.append(all_d)

# Write every collected record to a JSON file
with open('articles.json', 'w', encoding='utf-8') as f:
    json.dump(all_d_list, f, indent=2)

print("JSON file generated successfully.")
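
# A minimal spot-check sketch, assuming the scrape above completed and wrote
# articles.json to the working directory; it simply reloads the file and
# prints a short summary using only the keys defined in all_d above.
with open('articles.json', encoding='utf-8') as f:
    articles = json.load(f)

print(f"Loaded {len(articles)} article records")
if articles:
    sample = articles[0]
    print("Sample title:", sample["Title"])
    print("Sample DOI:", sample["DOI"])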