diff --git a/EJDE_spider/Transf.py b/EJDE_spider/Transf.py deleted file mode 100644 index dce6a10..0000000 --- a/EJDE_spider/Transf.py +++ /dev/null @@ -1,38 +0,0 @@ -import os -import json - -# Function -# Get the data from input files -def Read(folder_path): - data = [] - - for filename in os.listdir(folder_path): - if filename.endswith('.json'): - file_path = os.path.join(folder_path, filename) - with open(file_path, 'r', encoding='utf-8') as file: - data.extend(json.load(file)) - return data - -# Write into output files -def Write(data, output_file): - with open(output_file, 'w', encoding='utf-8') as file: - json.dump(data, file, indent=4) - -# Path of files need to be read -folder_path1 = '.\ejde_buffer\Author' -folder_path2 = '.\ejde_buffer\Article' - -# Read the data in the files -Author_data = Read(folder_path1) -Article_data = Read(folder_path2) - -# The path of output files -output_file1 = '.\ejde_buffer\Author_output_file.json' -output_file2 = '.\ejde_buffer\Article_output_file.json' - -# Write into files -Write(Author_data, output_file1) -Write(Article_data, output_file2) - -# End -print("\nData has been written into files.") \ No newline at end of file diff --git a/EJDE_spider/ejde_scrawler.py b/EJDE_spider/ejde_main.py similarity index 82% rename from EJDE_spider/ejde_scrawler.py rename to EJDE_spider/ejde_main.py index 49b95e0..ec679e8 100644 --- a/EJDE_spider/ejde_scrawler.py +++ b/EJDE_spider/ejde_main.py @@ -1,23 +1,22 @@ -import os import uuid import requests -from bs4 import BeautifulSoup import re -import json +import ejde_save + from concurrent.futures import ThreadPoolExecutor, as_completed from retrying import retry +from bs4 import BeautifulSoup +''' + Target site: 'ejde.math.txstate.edu' -def save_data(dataset, filetype, filename): - if dataset: - directory = "./ejde_buffer/" + filetype + "/" - os.makedirs(directory, exist_ok=True) - filepath = os.path.join(directory, filename) - with open(filepath, "w", encoding='utf-8') as json_file: - json.dump(dataset, json_file, indent=4) - print(filetype + " data have been added to", filepath) - + ========== Run order ========== + 1. ejde_main: collect the journal links for each year -> scrape each article's details and author info -> call ejde_save -> store them in small temporary JSON files + 2. ejde_save: read the buffered small files locally, filter them, and merge them into large files grouped by year + *3. ejde_save.delete() (optional): delete every file in the temporary storage area (back them up first) +''' +# Article and author detail @retry(wait_fixed=5000, stop_max_attempt_number=5) def process_article(url): response = requests.get(url) @@ -43,7 +42,7 @@ def process_article(url): # Extract volume volume_match = re.search(r'Vol\. 
(\d+) \((\d+)\)', article_text) - volume = volume_match.group(1) if volume_match else None + volume = str(volume_match.group(1)) if volume_match else None # year = volume_match.group(2) if volume_match else None # Extract pp @@ -141,11 +140,11 @@ def process_article(url): # Save the data periodically based on batch size if len(articleData) % batch_size == 0: - save_data(articleData, "Article", str(uuid.uuid4()) + ".json") + ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json") articleData.clear() if len(authorData) % batch_size == 0: - save_data(authorData, "Author", str(uuid.uuid4()) + ".json") + ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json") authorData.clear() @@ -162,7 +161,7 @@ url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_l authorData = [] articleData = [] -batch_size = 500 # Number of articles to process before saving +batch_size = 5 # Number of articles to process before saving executor = ThreadPoolExecutor(max_workers=20) # Set the number of worker threads # Process each URL using multithreading @@ -176,10 +175,14 @@ for future in as_completed(futures): print("An error occurred:", str(e)) # Save remaining data -if articleData: - save_data(articleData, "Article", str(uuid.uuid4()) + ".json") - print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article/") +if len(articleData) > 0: + ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json") + print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/") -if authorData: - save_data(authorData, "Author", str(uuid.uuid4()) + ".json") - print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author/") +if len(authorData) > 0: + ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json") + print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/") + +# Transfer to large file and delete the temporary storage files +ejde_save.Transf() +ejde_save.delete() diff --git a/EJDE_spider/ejde_save.py b/EJDE_spider/ejde_save.py new file mode 100644 index 0000000..5b67447 --- /dev/null +++ b/EJDE_spider/ejde_save.py @@ -0,0 +1,93 @@ +import os +import json + + +# Save data +def save_data(dataset, filetype, filename): + if dataset: + directory = "./ejde_buffer/" + filetype + "/" + os.makedirs(directory, exist_ok=True) + filepath = os.path.join(directory, filename) + with open(filepath, "w", encoding='utf-8') as json_file: + json.dump(dataset, json_file, indent=4) + print(filetype + " data have been added to", filepath) + + +# Write into output files +def Transf(): + def Read(folder_path, output_files): + # Create new folders + os.makedirs('./ejde_buffer/Article_output/', exist_ok=True) + os.makedirs('./ejde_buffer/Author_output/', exist_ok=True) + + data_oldest = [] + data_2010_2014 = [] + data_2015_2020 = [] + data_newest = [] + + for filename in os.listdir(folder_path): + if filename.endswith('.json'): + file_path = os.path.join(folder_path, filename) + with open(file_path, 'r', encoding='utf-8') as file: + data = json.load(file) + + for Dict in data: + if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None: + # Select data + data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)] + + data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int( + Dict.get('volume') or Dict.get('affiliation', 
[{}])[0].get('year', 0)) <= 2014)] + + data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)] + + data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)] + + Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest] + + # Transfer + for index in range(0, 4): + with open(output_files[index], 'w', encoding='utf-8') as file: + json.dump(Data[index], file, indent=4) + + # The path of reading + author_folder_path = './ejde_buffer/Author_TS' + article_folder_path = './ejde_buffer/Article_TS' + + # The path of storage + author_output_file = [ + './ejde_buffer/Author_output/Author_output_file(oldest).json', + './ejde_buffer/Author_output/Author_output_file(2010-2014).json', + './ejde_buffer/Author_output/Author_output_file(2015-2020).json', + './ejde_buffer/Author_output/Author_output_file(newest).json' + ] + + article_output_file = [ + './ejde_buffer/Article_output/Article_output_file(oldest).json', + './ejde_buffer/Article_output/Article_output_file(2010-2014).json', + './ejde_buffer/Article_output/Article_output_file(2015-2020).json', + './ejde_buffer/Article_output/Article_output_file(newest).json' + ] + + # Read and write into files + Read(author_folder_path, author_output_file) + Read(article_folder_path, article_output_file) + + # End + print("\nData has been written into files.") + + +# Delete files in temporary storage area +def delete(): + folder_paths = ['./ejde_buffer/Author_TS', './ejde_buffer/Article_TS'] + for folder_path in folder_paths: + file_names = os.listdir(folder_path) + for file_name in file_names: + file_path = os.path.join(folder_path, file_name) + if os.path.isfile(file_path): + os.remove(file_path) + + print('\nAttention: The temporary storage files have been deleted!') diff --git a/EJQTDE_spider/ejqtde_main.py b/EJQTDE_spider/ejqtde_main.py new file mode 100644 index 0000000..4917e6c --- /dev/null +++ b/EJQTDE_spider/ejqtde_main.py @@ -0,0 +1,113 @@ +import re +import datetime +import threading +import urllib +import ejqtde_scrawler +import ejqtde_save + +from selenium import webdriver +from bs4 import BeautifulSoup +from selenium.webdriver.edge.options import Options +from concurrent.futures import ThreadPoolExecutor, as_completed, wait +from urllib.parse import urljoin + +''' + Target site: 'https://www.math.u-szeged.hu/ejqtde' + + ========== Run order ========== + 1. ejqtde_main: collect the journal links for each year + 2. ejqtde_scrawler: scrape each article's details and author info -> call ejqtde_save -> store them in small temporary JSON files + 3. ejqtde_save: read the buffered small files locally, filter them, and merge them into large files grouped by year + *4. ejqtde_save.delete() (optional): delete every file in the temporary storage area (back them up first) +''' + + +# Extract article links (run by the thread pool) +def extract_href(link): + driver = webdriver.Edge(options=options) + driver.get(link) + html_code = driver.page_source + soup = BeautifulSoup(html_code, 'html.parser') + column_right = soup.find('div', id='columnRight') + if column_right: + ordered_lists = column_right.find_all('ol') + for idx, ordered_list in enumerate(ordered_lists, 1): + for list_item in ordered_list.find_all('li'): + matches = re.findall(r': = 50: + with locks: + count1 += len(Article_list) + ejqtde_save.save_data(Article_list, "Article_TS") + Article_list.clear() + + if len(Author_list) >= 50: + with locks: + count2 += len(Author_list) + ejqtde_save.save_data(Author_list, "Author_TS") + Author_list.clear() + wait(futures) + + # Deal with the remaining data + if len(Article_list) > 0: + count1 += len(Article_list) +
ejqtde_save.save_data(Article_list, "Article_TS") + Article_list.clear() + print('Finished: All article_data has been added to ./EJQTDE_buffer/Article_TS/') + if len(Author_list) > 0: + count2 += len(Author_list) + ejqtde_save.save_data(Author_list, "Author_TS") + Author_list.clear() + print('Finished: All author_data has been added to ./EJQTDE_buffer/Author_TS/') + +print('\nThe whole scrawler program has been done\n') +print(count1, ' article_data has been stored.') +print(count2, ' author_data has been stored.') + +# Transfer to large file and delete the temporary storage files +ejqtde_save.Transf() +ejqtde_save.delete() \ No newline at end of file diff --git a/EJQTDE_spider/ejqtde_save.py b/EJQTDE_spider/ejqtde_save.py new file mode 100644 index 0000000..5ae8e54 --- /dev/null +++ b/EJQTDE_spider/ejqtde_save.py @@ -0,0 +1,96 @@ +import os +import json +import uuid + + +# Save into files +def save_data(dataset, filetype): + if dataset: + filename = str(uuid.uuid4()) + ".json" + directory = "./EJQTDE_buffer/" + filetype + "/" + os.makedirs(directory, exist_ok=True) + filepath = os.path.join(directory, filename) + with open(filepath, "w", encoding='utf-8') as json_file: + json.dump(dataset, json_file, indent=4) + print(filetype + " data have been added to", filepath) + + +# Summary files +def Transf(): + def Read(folder_path, output_files): + # Create new folder + os.makedirs('./EJQTDE_buffer/Article_output/', exist_ok=True) + os.makedirs('./EJQTDE_buffer/Author_output/', exist_ok=True) + + data_oldest = [] + data_2010_2014 = [] + data_2015_2020 = [] + data_newest = [] + + for filename in os.listdir(folder_path): + if filename.endswith('.json'): + file_path = os.path.join(folder_path, filename) + with open(file_path, 'r', encoding='utf-8') as file: + data = json.load(file) + + for Dict in data: + if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None: + # Select data + data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)] + + data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)] + + data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)] + + data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)] + + Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest] + + # Transfer + for index in range(0, 4): + with open(output_files[index], 'w', encoding='utf-8') as file: + json.dump(Data[index], file, indent=4) + + # The path of reading + author_folder_path = './EJQTDE_buffer/Author_TS' + article_folder_path = './EJQTDE_buffer/Article_TS' + + # The path of storage + author_output_file = [ + './EJQTDE_buffer/Author_output/Author_output_file(oldest).json', + './EJQTDE_buffer/Author_output/Author_output_file(2010-2014).json', + './EJQTDE_buffer/Author_output/Author_output_file(2015-2020).json', + './EJQTDE_buffer/Author_output/Author_output_file(newest).json' + ] + + article_output_file = [ + './EJQTDE_buffer/Article_output/Article_output_file(oldest).json', + './EJQTDE_buffer/Article_output/Article_output_file(2010-2014).json', + './EJQTDE_buffer/Article_output/Article_output_file(2015-2020).json', + 
'./EJQTDE_buffer/Article_output/Article_output_file(newest).json' + ] + + # Read and write into files + Read(author_folder_path, author_output_file) + Read(article_folder_path, article_output_file) + + # End + print("\nData has been written into files.") + + +# Delete files in temporary storage area +def delete(): + folder_paths = ['./EJQTDE_buffer/Author_TS', './EJQTDE_buffer/Article_TS'] + for folder_path in folder_paths: + file_names = os.listdir(folder_path) + for file_name in file_names: + file_path = os.path.join(folder_path, file_name) + if os.path.isfile(file_path): + os.remove(file_path) + + print('\nAttention: The temporary storage files have been deleted!') + diff --git a/EJQTDE_spider/ejqtde_scrawler.py b/EJQTDE_spider/ejqtde_scrawler.py new file mode 100644 index 0000000..1f3975d --- /dev/null +++ b/EJQTDE_spider/ejqtde_scrawler.py @@ -0,0 +1,187 @@ +import time +import uuid +import re +import urllib + +from selenium.webdriver.edge.options import Options +from selenium import webdriver +from bs4 import BeautifulSoup +from urllib.parse import urljoin + + +# Get the information in the webpage through selenium +def source(driver, num): + if driver.find_elements(by='id', value='columnRight'): + html_code = driver.page_source + soup = BeautifulSoup(html_code, 'html.parser') + return soup + elif num == 5: + print('Out of times!') + driver.quit() + return None + else: + num += 1 + time.sleep(3) + return source(driver, num) + + +# Get the links of the authors' information +def author_links(Data): + Author_links = [] + Author_hrefs_pattern = re.compile(r'periodica\.html\?periodica=1&' + 'paramtipus_ertek=person_data&param_ertek=\d+') + Author_hrefs = re.findall(Author_hrefs_pattern, str(Data)) + for Author_href in Author_hrefs: + Author_href = urllib.parse.urljoin('https://www.math.u-szeged.hu/ejqtde/', Author_href) + Author_links.append(Author_href) + + return Author_links + + +# Get the information of the authors +def author_detail(Data, Year, article_id, Author_list): + # Name + author = Data.find('p', class_='publication_head').get_text() + + author = author.split(',') + author = [char.replace(' ', '') for char in author] + + Firstname = author[0] + Lastname = author[-1] + Middlename = ''.join(author[1:-1]) if author[1:-1] else None + + # infor + table = Data.find('table', attrs={'border': '1', 'cellpadding': '2px'}) + Td = table.find_all('td') + line = [td for td in Td] + + # Affiliation + Affiliation = line[1].get_text() + + # Email + Email = line[0].find('a').get('href') + + author_data = { + "author_id": str(uuid.uuid4()), + "from_article": article_id, + "firstname": Firstname, + "lastname": Lastname, + "middlename": Middlename, + "affiliation": [ + { + "year": Year, + "affiliation": Affiliation, + "email": Email + } + ] + } + + Author_list.append(author_data) + return Author_list + + +# Get the article's information +def article_detail(Data, URL, article_id, Aricle_list): + # Title + font = Data.find('font', attrs={'size': '+1'}) + Title = font.find('b').get_text() + + # Author and Corresponding_authors + author_pattern = re.compile(r'periodica\.html\?periodica=1&' + r'paramtipus_ertek=person_data&param_ertek=\d+">(.*?)') + Author = re.findall(author_pattern, str(Data)) + Corresponding_author = Author[-1] # Corresponding_authors + del Author[-1] + + # Submit_datetime and publish_datetime + time = Data.find('td', attrs={'align': 'right', 'width': '50%'}) + time = re.findall(r'\d+-\d+-\d+', str(time)) + Submit_date = time[0] if time[0] else None + Publish_date = time[1] if 
time[1] else None + + # Keyword + Keyword = Data.find('keywords').get_text() if Data.find('keywords') is not None else None + Keyword = Keyword.split(', ') if Keyword is not None else None + + # MSC + MSC = Data.find('subjectcodes').get_text() if Data.find('subjectcodes') is not None else None + MSC = MSC.split(', ') if MSC is not None else None + + # DOI + if len(re.findall(r' 0: + DOI = re.findall(r'(\d+)', str(Data))[0] + + # Issue and page + result = Data.select_one('body > div:nth-of-type(3) > div:nth-of-type(2)').get_text() + Issue = re.findall(r'(\d+), \d+-\d+', result)[0] + Page = re.findall(r'\d+, (\d+-\d+)', result)[0] + + article_data = { + "article_id": article_id, + "title": Title, + "authors": Author, + "corresponding_authors": Corresponding_author, + "submit_datetime": Submit_date, + "publish_datetime": Publish_date, + "keywords": Keyword, + "MSC": MSC, + "URL": URL, + "DOI": DOI, + "publisher": Publisher, + "journal": Journal, + "volume": Volume, + "issue": Issue, + "page": Page, + } + + Aricle_list.append(article_data) + return Aricle_list + + +# Main code of scrawler +def scrawler(URL, lock, Article_list, Author_list): + print('Start: ', URL) + driver = webdriver.Edge(options=options) + driver.get(URL) + + # Enter the detail page + Max_retryTimes = 3 + Essay_data = source(driver, Max_retryTimes) + if Essay_data is not None: + article_id = str(uuid.uuid4()) + Article_list = article_detail(Essay_data, URL, article_id, Article_list) + + # Get the authors' information + Year = re.findall(r'(\d+)', str(Essay_data))[0] + for author_link in author_links(Essay_data): + driver.get(author_link) + Author_detail = source(driver, Max_retryTimes) + Author_list = author_detail(Author_detail, Year, article_id, Author_list) + + + print('Complete: ', URL) + driver.quit() + + else: + print('Wrong: Some error occurred: ', URL) + pass + + +# Options setting +options = Options() +options.add_argument('--headless') # Run Edge in headless mode +options.add_argument('disable-gpu') # Disable GPU acceleration +options.add_argument('pageLoadStrategy=none') # Set page load strategy to 'none' + + diff --git a/projecteuclid_spider b/ProjectEuclid_spider/projecteuclid_main similarity index 100% rename from projecteuclid_spider rename to ProjectEuclid_spider/projecteuclid_main diff --git a/SpringerOpen_spider/SD_detail.py b/SpringerOpen_spider/SD_detail.py index 8dd02dc..73639c7 100644 --- a/SpringerOpen_spider/SD_detail.py +++ b/SpringerOpen_spider/SD_detail.py @@ -88,7 +88,7 @@ def Article_dict(soup, url, article_id): time = time.get_text() Time.append(time) - Submitted_date = Time[0] + Submit_date = Time[0] Publish_date = Time[-1] # keyword @@ -132,7 +132,7 @@ def Article_dict(soup, url, article_id): "title": Title, "authors": Author, "corresponding_authors": Corresponding_author, - "submit_datetime": Submitted_date, + "submit_datetime": Submit_date, "publish_datetime": Publish_date, "keywords": Keyword, "MSC": MSC, diff --git a/SpringerOpen_spider/SD_header.py b/SpringerOpen_spider/SD_header.py deleted file mode 100644 index 162e82d..0000000 --- a/SpringerOpen_spider/SD_header.py +++ /dev/null @@ -1,25 +0,0 @@ -import random - -# 用户代理地址池 -uapools=[ - "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201", - "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3493.3 Safari/537.36", - "Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12", - 
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0" , - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11', - 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1', - 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50', - 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1', -] - -def header(): - # 网站请求头 - headers = { - 'User-Agent': random.choice(uapools), - } - - return headers - - - diff --git a/SpringerOpen_spider/SD_link.py b/SpringerOpen_spider/SD_link.py index a46d542..1b5eed1 100644 --- a/SpringerOpen_spider/SD_link.py +++ b/SpringerOpen_spider/SD_link.py @@ -1,6 +1,28 @@ +import random import requests from bs4 import BeautifulSoup +# 用户代理地址池 +uapools=[ + "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201", + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3493.3 Safari/537.36", + "Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12", + "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0" , + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11', + 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1', + 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50', + 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1', +] + +def header(): + # 网站请求头 + headers = { + 'User-Agent': random.choice(uapools), + } + + return headers + # 标准访问格式 def Link(url, headers): try: diff --git a/SpringerOpen_spider/SD_main.py b/SpringerOpen_spider/SD_main.py index e94fd72..c56f993 100644 --- a/SpringerOpen_spider/SD_main.py +++ b/SpringerOpen_spider/SD_main.py @@ -1,6 +1,5 @@ import urllib -import SD_header import SD_link import SD_threads import SD_save @@ -20,8 +19,8 @@ from urllib.parse import urljoin # 存放网页链接的空列表 -Links = [] # A list for links Webs = [] # A list for web url +Links = [] # A list for links # 存放爬取数据的空列表 Article_data = [] @@ -29,7 +28,7 @@ Author_data = [] # ==========访问论文列表页========== # 获取数学类期刊网站链接 -headers = SD_header.header() +headers = SD_link.header() soup = SD_link.Link('https://www.springeropen.com/journals', headers) hrefs = soup.find('ol', id='Mathematics-list') diff --git a/SpringerOpen_spider/SD_save.py b/SpringerOpen_spider/SD_save.py index b088f5c..0d7119d 100644 --- a/SpringerOpen_spider/SD_save.py +++ b/SpringerOpen_spider/SD_save.py @@ -20,31 +20,38 @@ def Transf(): os.makedirs('./SpringerOpen_buffer/Article_output/', exist_ok=True) os.makedirs('./SpringerOpen_buffer/Author_output/', exist_ok=True) + data_oldest = [] + data_2010_2014 = [] + data_2015_2020 = [] + data_newest = [] + for filename in os.listdir(folder_path): if filename.endswith('.json'): file_path = os.path.join(folder_path, filename) with open(file_path, 'r', encoding='utf-8') as file: data = json.load(file) - # 筛选文章 - data_oldest = [Dict for Dict in data if (isinstance(Dict, dict) and int( - Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)] + for Dict in data: + if Dict.get('volume') and 
Dict.get('affiliation', [{}])[0].get('year', 0) is not None: + # Select data + data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)] - data_2010_2014 = [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int( - Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)] + data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)] - data_2015_2020 = [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int( - Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)] + data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)] - data_newest = [Dict for Dict in data if (isinstance(Dict, dict) and int( - Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)] + data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)] - Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest] + Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest] - # Transfer - for index in range(0, 4): - with open(output_files[index], 'w', encoding='utf-8') as file: - json.dump(Data[index], file, indent=4) + # Transfer + for index in range(0, 4): + with open(output_files[index], 'w', encoding='utf-8') as file: + json.dump(Data[index], file, indent=4) # The path of reading diff --git a/SpringerOpen_spider/SD_scrawl.py b/SpringerOpen_spider/SD_scrawl.py index c79f41a..d2b8ed2 100644 --- a/SpringerOpen_spider/SD_scrawl.py +++ b/SpringerOpen_spider/SD_scrawl.py @@ -3,7 +3,6 @@ import urllib import uuid from urllib.parse import urljoin -import SD_header import SD_link import SD_detail import SD_save @@ -11,7 +10,7 @@ import SD_save # ========== Get the links of the article detail pages ========== def Scrawl(Link, Article_data, Author_data): # Visit the article list page - headers = SD_header.header() + headers = SD_link.header() soup = SD_link.Link(Link, headers) print(Link)
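Both ejde_save.Transf() and SD_save.Transf() in this patch split the buffered records into four period files, keying on either the record's 'volume' (compared as a publication year) or the year in its first affiliation entry. Below is a minimal, self-contained sketch of that bucketing step, assuming only the record layout visible in the patch; the helper names bucket_by_year and merge_and_bucket are illustrative and are not part of the changed files.

import json
import os


def bucket_by_year(records):
    # Route each record into one of the four period buckets used by the
    # *_save.Transf() functions. A record is assumed to carry either a
    # 'volume' field holding the year or an 'affiliation' list whose first
    # entry has a 'year' key, as in the patch.
    buckets = {'oldest': [], '2010-2014': [], '2015-2020': [], 'newest': []}
    for record in records:
        if not isinstance(record, dict):
            continue
        raw = record.get('volume') or (record.get('affiliation') or [{}])[0].get('year', 0)
        try:
            year = int(raw)
        except (TypeError, ValueError):
            continue  # skip records without a usable year
        if year <= 2009:
            buckets['oldest'].append(record)
        elif year <= 2014:
            buckets['2010-2014'].append(record)
        elif year <= 2020:
            buckets['2015-2020'].append(record)
        else:
            buckets['newest'].append(record)
    return buckets


def merge_and_bucket(folder_path):
    # Usage sketch: merge every temporary JSON file in a buffer folder
    # (e.g. './ejde_buffer/Article_TS'), then bucket the combined list.
    merged = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as fh:
                merged.extend(json.load(fh))
    return bucket_by_year(merged)

Appending each record to exactly one bucket as the files are read keeps the merge to a single pass and avoids writing the same record to an output file more than once.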