From 26fed37e17cce99dfab45043d0bd9d8630e4d7c5 Mon Sep 17 00:00:00 2001
From: XCX <1361986662@qq.com>
Date: Thu, 27 Jul 2023 10:26:02 +0800
Subject: [PATCH] Modified old code

---
 EJDE_spider/Transf.py                         | 38 ----
 .../{ejde_scrawler.py => ejde_main.py}        | 47 ++---
 EJDE_spider/ejde_save.py                      | 93 ++++++++++
 ...tde_href_multithread.py => ejqtde_main.py} |  6 +-
 EJQTDE_spider/ejqtde_save.py                  | 33 ++--
 ProjectEuclid_spider/projecteuclid_main       | 168 ++++++++++++++++++
 SpringerOpen_spider/SD_save.py                | 30 ++--
 7 files changed, 324 insertions(+), 91 deletions(-)
 delete mode 100644 EJDE_spider/Transf.py
 rename EJDE_spider/{ejde_scrawler.py => ejde_main.py} (82%)
 create mode 100644 EJDE_spider/ejde_save.py
 rename EJQTDE_spider/{ejqtde_href_multithread.py => ejqtde_main.py} (94%)
 create mode 100644 ProjectEuclid_spider/projecteuclid_main

diff --git a/EJDE_spider/Transf.py b/EJDE_spider/Transf.py
deleted file mode 100644
index dce6a10..0000000
--- a/EJDE_spider/Transf.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import os
-import json
-
-# Function
-# Get the data from input files
-def Read(folder_path):
-    data = []
-
-    for filename in os.listdir(folder_path):
-        if filename.endswith('.json'):
-            file_path = os.path.join(folder_path, filename)
-            with open(file_path, 'r', encoding='utf-8') as file:
-                data.extend(json.load(file))
-    return data
-
-# Write into output files
-def Write(data, output_file):
-    with open(output_file, 'w', encoding='utf-8') as file:
-        json.dump(data, file, indent=4)
-
-# Path of files need to be read
-folder_path1 = '.\ejde_buffer\Author'
-folder_path2 = '.\ejde_buffer\Article'
-
-# Read the data in the files
-Author_data = Read(folder_path1)
-Article_data = Read(folder_path2)
-
-# The path of output files
-output_file1 = '.\ejde_buffer\Author_output_file.json'
-output_file2 = '.\ejde_buffer\Article_output_file.json'
-
-# Write into files
-Write(Author_data, output_file1)
-Write(Article_data, output_file2)
-
-# End
-print("\nData has been written into files.")
\ No newline at end of file
diff --git a/EJDE_spider/ejde_scrawler.py b/EJDE_spider/ejde_main.py
similarity index 82%
rename from EJDE_spider/ejde_scrawler.py
rename to EJDE_spider/ejde_main.py
index 49b95e0..ec679e8 100644
--- a/EJDE_spider/ejde_scrawler.py
+++ b/EJDE_spider/ejde_main.py
@@ -1,23 +1,22 @@
-import os
 import uuid
 import requests
-from bs4 import BeautifulSoup
 import re
-import json
+import ejde_save
+
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from retrying import retry
+from bs4 import BeautifulSoup
+'''
+    Target site: 'ejde.math.txstate.edu'
 
-def save_data(dataset, filetype, filename):
-    if dataset:
-        directory = "./ejde_buffer/" + filetype + "/"
-        os.makedirs(directory, exist_ok=True)
-        filepath = os.path.join(directory, filename)
-        with open(filepath, "w", encoding='utf-8') as json_file:
-            json.dump(dataset, json_file, indent=4)
-        print(filetype + " data have been added to", filepath)
-
+    ========== Run order ==========
+    1. ejde_main    collect the journal links for each year -> scrape each paper's article and author info -> call ejde_save -> buffer into small JSON files
+    2. ejde_save    read the buffered small files locally, filter them, and merge them into large per-year-range files
+    *3. ejde_save.delete() (optional)    delete every file in the buffer area (remember to back up)
+'''
+# Article and author details
 @retry(wait_fixed=5000, stop_max_attempt_number=5)
 def process_article(url):
     response = requests.get(url)
@@ -43,7 +42,7 @@ def process_article(url):
 
     # Extract volume
     volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
-    volume = volume_match.group(1) if volume_match else None
+    volume = str(volume_match.group(1)) if volume_match else None
     # year = volume_match.group(2) if volume_match else None
 
     # Extract pp
@@ -141,11 +140,11 @@ def process_article(url):
 
     # Save the data periodically based on batch size
     if len(articleData) % batch_size == 0:
-        save_data(articleData, "Article", str(uuid.uuid4()) + ".json")
+        ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
         articleData.clear()
 
     if len(authorData) % batch_size == 0:
-        save_data(authorData, "Author", str(uuid.uuid4()) + ".json")
+        ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
         authorData.clear()
 
@@ -162,7 +161,7 @@ url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_l
 
 authorData = []
 articleData = []
-batch_size = 500  # Number of articles to process before saving
+batch_size = 5  # Number of articles to process before saving
 executor = ThreadPoolExecutor(max_workers=20)  # Set the number of worker threads
 
 # Process each URL using multithreading
@@ -176,10 +175,14 @@ for future in as_completed(futures):
         print("An error occurred:", str(e))
 
 # Save remaining data
-if articleData:
-    save_data(articleData, "Article", str(uuid.uuid4()) + ".json")
-    print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article/")
+if len(articleData) > 0:
+    ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
+    print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
 
-if authorData:
-    save_data(authorData, "Author", str(uuid.uuid4()) + ".json")
-    print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author/")
+if len(authorData) > 0:
+    ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
+    print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
+
+# Transfer into the large files and delete the temporary storage files
+ejde_save.Transf()
+ejde_save.delete()
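
Note: process_article() above guards each page fetch with the retrying package. A self-contained sketch of that decorator usage, with an illustrative fetch body (the timeout and status check are additions, not part of this patch):

    import requests
    from retrying import retry

    @retry(wait_fixed=5000, stop_max_attempt_number=5)
    def fetch(url):
        # Retry up to 5 times, pausing 5000 ms between attempts, whenever
        # the body raises (timeouts, connection resets, HTTP errors).
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.text

    html = fetch("https://ejde.math.txstate.edu/")
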
diff --git a/EJDE_spider/ejde_save.py b/EJDE_spider/ejde_save.py
new file mode 100644
index 0000000..5b67447
--- /dev/null
+++ b/EJDE_spider/ejde_save.py
@@ -0,0 +1,93 @@
+import os
+import json
+
+
+# Save data
+def save_data(dataset, filetype, filename):
+    if dataset:
+        directory = "./ejde_buffer/" + filetype + "/"
+        os.makedirs(directory, exist_ok=True)
+        filepath = os.path.join(directory, filename)
+        with open(filepath, "w", encoding='utf-8') as json_file:
+            json.dump(dataset, json_file, indent=4)
+        print(filetype + " data have been added to", filepath)
+
+
+# Write into output files
+def Transf():
+    def Read(folder_path, output_files):
+        # Create new folders
+        os.makedirs('./ejde_buffer/Article_output/', exist_ok=True)
+        os.makedirs('./ejde_buffer/Author_output/', exist_ok=True)
+
+        data_oldest = []
+        data_2010_2014 = []
+        data_2015_2020 = []
+        data_newest = []
+
+        for filename in os.listdir(folder_path):
+            if filename.endswith('.json'):
+                file_path = os.path.join(folder_path, filename)
+                with open(file_path, 'r', encoding='utf-8') as file:
+                    data = json.load(file)
+
+                # Select data: bucket each record exactly once by year;
+                # article records carry the year in 'volume', author records
+                # fall back to the year of their first affiliation
+                for Dict in data:
+                    if not isinstance(Dict, dict):
+                        continue
+                    year = int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year') or 0)
+                    if year <= 2009:
+                        data_oldest.append(Dict)
+                    elif year <= 2014:
+                        data_2010_2014.append(Dict)
+                    elif year <= 2020:
+                        data_2015_2020.append(Dict)
+                    else:
+                        data_newest.append(Dict)
+
+        Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
+
+        # Transfer
+        for index in range(0, 4):
+            with open(output_files[index], 'w', encoding='utf-8') as file:
+                json.dump(Data[index], file, indent=4)
+
+    # The path of reading
+    author_folder_path = './ejde_buffer/Author_TS'
+    article_folder_path = './ejde_buffer/Article_TS'
+
+    # The path of storage
+    author_output_file = [
+        './ejde_buffer/Author_output/Author_output_file(oldest).json',
+        './ejde_buffer/Author_output/Author_output_file(2010-2014).json',
+        './ejde_buffer/Author_output/Author_output_file(2015-2020).json',
+        './ejde_buffer/Author_output/Author_output_file(newest).json'
+    ]
+
+    article_output_file = [
+        './ejde_buffer/Article_output/Article_output_file(oldest).json',
+        './ejde_buffer/Article_output/Article_output_file(2010-2014).json',
+        './ejde_buffer/Article_output/Article_output_file(2015-2020).json',
+        './ejde_buffer/Article_output/Article_output_file(newest).json'
+    ]
+
+    # Read and write into files
+    Read(author_folder_path, author_output_file)
+    Read(article_folder_path, article_output_file)
+
+    # End
+    print("\nData has been written into files.")
+
+
+# Delete files in the temporary storage area
+def delete():
+    folder_paths = ['./ejde_buffer/Author_TS', './ejde_buffer/Article_TS']
+    for folder_path in folder_paths:
+        file_names = os.listdir(folder_path)
+        for file_name in file_names:
+            file_path = os.path.join(folder_path, file_name)
+            if os.path.isfile(file_path):
+                os.remove(file_path)
+
+    print('\nAttention: The temporary storage files have been deleted!')
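
Note: a minimal sketch of how this module is meant to be driven end to end. The sample records are hypothetical, but the calls and the year fields ('volume', affiliation 'year') are the ones the code above expects:

    import uuid

    import ejde_save

    # Hypothetical buffered batches; the real ones come from process_article().
    articles = [{"volume": "2016", "title": "A sample EJDE article"}]
    authors = [{"name": "A. Author", "affiliation": [{"year": "2016"}]}]

    # Buffer small JSON files into ./ejde_buffer/Article_TS and Author_TS.
    ejde_save.save_data(articles, "Article_TS", str(uuid.uuid4()) + ".json")
    ejde_save.save_data(authors, "Author_TS", str(uuid.uuid4()) + ".json")

    ejde_save.Transf()   # merge the buffers into the four year-range files
    ejde_save.delete()   # clear the *_TS buffers (back up anything you need first)
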
diff --git a/EJQTDE_spider/ejqtde_href_multithread.py b/EJQTDE_spider/ejqtde_main.py
similarity index 94%
rename from EJQTDE_spider/ejqtde_href_multithread.py
rename to EJQTDE_spider/ejqtde_main.py
index 311feca..1a045ff 100644
--- a/EJQTDE_spider/ejqtde_href_multithread.py
+++ b/EJQTDE_spider/ejqtde_main.py
@@ -12,10 +12,10 @@ from concurrent.futures import ThreadPoolExecutor, as_completed, wait
 from urllib.parse import urljoin
 
 '''
-    Target site: 'https://www.math.u-szeged.hu/ejqtde
+    Target site: 'https://www.math.u-szeged.hu/ejqtde'
 
     ========== Run order ==========
-    1. ejqtde_href_multithread    collect the journal links for each year
+    1. ejqtde_main    collect the journal links for each year
     2. ejqtde_scrawler    scrape each paper's article and author info -> call ejqtde_save -> buffer into small JSON files
     3. ejqtde_save    read the buffered small files locally, filter them, and merge them into large per-year-range files
     *4. ejqtde_save.delete() (optional)    delete every file in the buffer area (remember to back up)
'''
@@ -70,6 +70,7 @@ with ThreadPoolExecutor(max_workers=25) as executor:
     wait(futures)
 
 print('\nAll links have been got.\n')
+# Use multithreading to get the data
 count1 = 0
 count2 = 0
 locks = threading.Lock()
@@ -107,5 +108,6 @@ print('\nThe whole scrawler program has been done\n')
 print(count1, ' article_data has been stored.')
 print(count2, ' author_data has been stored.')
 
+# Transfer into the large files and delete the temporary storage files
 ejqtde_save.Transf()
 ejqtde_save.delete()
\ No newline at end of file
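
Note: the scrape above fans out over a ThreadPoolExecutor, and the shared count1/count2 counters are guarded by a threading.Lock. The pattern in isolation, with an illustrative work body (not the scraper's actual code):

    import threading
    from concurrent.futures import ThreadPoolExecutor, as_completed

    count = 0
    lock = threading.Lock()

    def work(item):
        global count
        # ... fetch and store `item` here ...
        with lock:
            # Serialize the update so concurrent workers don't lose increments.
            count += 1

    with ThreadPoolExecutor(max_workers=25) as executor:
        futures = [executor.submit(work, n) for n in range(100)]
        for future in as_completed(futures):
            future.result()   # surface any worker exception

    print(count, 'items processed.')
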
diff --git a/EJQTDE_spider/ejqtde_save.py b/EJQTDE_spider/ejqtde_save.py
index 693159a..5ae8e54 100644
--- a/EJQTDE_spider/ejqtde_save.py
+++ b/EJQTDE_spider/ejqtde_save.py
@@ -18,7 +18,7 @@ def save_data(dataset, filetype):
 # Summary files
 def Transf():
     def Read(folder_path, output_files):
-        # 新建文件夹
+        # Create new folders
         os.makedirs('./EJQTDE_buffer/Article_output/', exist_ok=True)
         os.makedirs('./EJQTDE_buffer/Author_output/', exist_ok=True)
 
@@ -33,25 +33,27 @@ def Transf():
                 with open(file_path, 'r', encoding='utf-8') as file:
                     data = json.load(file)
 
-                # Select data
-                data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
-
-                data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
-
-                data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
-
-                data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
+                # Select data: bucket each record exactly once by year;
+                # article records carry the year in 'volume', author records
+                # fall back to the year of their first affiliation
+                for Dict in data:
+                    if not isinstance(Dict, dict):
+                        continue
+                    year = int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year') or 0)
+                    if year <= 2009:
+                        data_oldest.append(Dict)
+                    elif year <= 2014:
+                        data_2010_2014.append(Dict)
+                    elif year <= 2020:
+                        data_2015_2020.append(Dict)
+                    else:
+                        data_newest.append(Dict)
 
         Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
 
         # Transfer
         for index in range(0, 4):
             with open(output_files[index], 'w', encoding='utf-8') as file:
                 json.dump(Data[index], file, indent=4)
 
     # The path of reading
     author_folder_path = './EJQTDE_buffer/Author_TS'
@@ -91,3 +93,4 @@ def delete():
             os.remove(file_path)
 
     print('\nAttention: The temporary storage files have been deleted!')
+
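
Note: the volume-or-affiliation year fallback that the save modules in this patch share, shown in isolation with made-up records:

    def record_year(rec):
        # Article records carry the year in 'volume'; author records fall
        # back to the year of their first affiliation; 0 means unknown.
        return int(rec.get('volume') or rec.get('affiliation', [{}])[0].get('year') or 0)

    print(record_year({'volume': '2018'}))                   # 2018
    print(record_year({'affiliation': [{'year': '2012'}]}))  # 2012
    print(record_year({}))                                   # 0 -> lands in the 'oldest' bucket
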
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-5_2f_6", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-3_2f_4", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-1_2f_2", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-11_2f_12", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-9_2f_10", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-7_2f_8", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-5_2f_6", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-3_2f_4", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-1_2f_2", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-11_2f_12", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-9_2f_10", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-7_2f_8", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-5_2f_6", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-3_2f_4", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-1_2f_2", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-11_2f_12", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-9_2f_10", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-7_2f_8", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-5_2f_6", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-3_2f_4", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-1_2f_2" + + +] + +all_d_list = [] + +# 遍历多个主页面的链接 +for main_page_url in main_page_urls: + response = requests.get(main_page_url) + html = response.text + soup = BeautifulSoup(html, "html.parser") + + pattern = re.compile(r'^/journals/differential-and-integral-equations/') + links = soup.find_all("a", href=pattern) + sub_urls = [link["href"] for link in links if link["href"].endswith(".full")] + + # 访问子链接并进行信息提取 + for sub_url in sub_urls: + full_sub_url = "https://projecteuclid.org" + sub_url + sub_response = requests.get(full_sub_url) + sub_html = sub_response.text + + # 执行子界面上的信息提取 + sub_soup = BeautifulSoup(sub_html, "html.parser") + + #寻找作者 + author_tags = sub_soup.find_all('meta', {'name': 'citation_author'}) + + authors = {} # 用于存储作者信息的字典 + + #对每一个的作者信息进行处理 + for i, tag in enumerate(author_tags, 1): + citation_author = tag['content'] + authors[i] = citation_author if citation_author else None + + #寻找文章的基本信息 + titles = [] + for title in sub_soup.find_all('meta',{'name':'citation_title'}): + if title.get('content') is not None: + titles.append(title.get('content')) + + + + + #寻找发布时间 + publish_times = [] + for publish_time in sub_soup.find_all('meta',{'name':'publish_date'}): + if publish_time.get('content'): + publish_times.append(str(publish_time.get('content'))) + else: + publish_time.append('None') + + + #寻找关键词 + keywords_list=[] + for keywords in sub_soup.find_all('meta',{'name':'citation_keywords'}): + if keywords.get('content'): + 
diff --git a/SpringerOpen_spider/SD_save.py b/SpringerOpen_spider/SD_save.py
index 713fc76..0d7119d 100644
--- a/SpringerOpen_spider/SD_save.py
+++ b/SpringerOpen_spider/SD_save.py
@@ -31,25 +31,27 @@ def Transf():
                 with open(file_path, 'r', encoding='utf-8') as file:
                     data = json.load(file)
 
-                # 筛选文章
-                data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
-
-                data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
-
-                data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
-
-                data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
+                # Filter the articles: bucket each record exactly once by
+                # year; article records carry the year in 'volume', author
+                # records fall back to the year of their first affiliation
+                for Dict in data:
+                    if not isinstance(Dict, dict):
+                        continue
+                    year = int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year') or 0)
+                    if year <= 2009:
+                        data_oldest.append(Dict)
+                    elif year <= 2014:
+                        data_2010_2014.append(Dict)
+                    elif year <= 2020:
+                        data_2015_2020.append(Dict)
+                    else:
+                        data_newest.append(Dict)
 
         Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
 
-        # 转存
+        # Transfer
         for index in range(0, 4):
             with open(output_files[index], 'w', encoding='utf-8') as file:
                 json.dump(Data[index], file, indent=4)
 
     # The path of reading