From cfa9345a797868844ed200b458b11b801421b8f9 Mon Sep 17 00:00:00 2001
From: XCX <1361986662@qq.com>
Date: Wed, 26 Jul 2023 23:25:30 +0800
Subject: [PATCH] Add a new spider for math.u-szeged.hu/ejqtde; modify the
 SpringerOpen_spider code

---
 EJQTDE_spider/ejqtde_href_multithread.py | 111 ++++++++++++++
 EJQTDE_spider/ejqtde_save.py             |  93 +++++++++++
 EJQTDE_spider/ejqtde_scrawler.py         | 187 +++++++++++++++++++++++
 SpringerOpen_spider/SD_detail.py         |   4 +-
 SpringerOpen_spider/SD_header.py         |  25 ---
 SpringerOpen_spider/SD_link.py           |  22 +++
 SpringerOpen_spider/SD_main.py           |   5 +-
 SpringerOpen_spider/SD_save.py           |  13 +-
 SpringerOpen_spider/SD_scrawl.py         |   3 +-
 9 files changed, 427 insertions(+), 36 deletions(-)
 create mode 100644 EJQTDE_spider/ejqtde_href_multithread.py
 create mode 100644 EJQTDE_spider/ejqtde_save.py
 create mode 100644 EJQTDE_spider/ejqtde_scrawler.py
 delete mode 100644 SpringerOpen_spider/SD_header.py

diff --git a/EJQTDE_spider/ejqtde_href_multithread.py b/EJQTDE_spider/ejqtde_href_multithread.py
new file mode 100644
index 0000000..311feca
--- /dev/null
+++ b/EJQTDE_spider/ejqtde_href_multithread.py
@@ -0,0 +1,111 @@
+import re
+import datetime
+import threading
+import urllib
+import ejqtde_scrawler
+import ejqtde_save
+
+from selenium import webdriver
+from bs4 import BeautifulSoup
+from selenium.webdriver.edge.options import Options
+from concurrent.futures import ThreadPoolExecutor, as_completed, wait
+from urllib.parse import urljoin
+
+'''
+    Target site: https://www.math.u-szeged.hu/ejqtde
+
+    ========== Run order ==========
+    1. ejqtde_href_multithread   collect the article links of every volume/year
+    2. ejqtde_scrawler           scrape each paper's metadata and its authors' details
+                                 -> calls ejqtde_save -> dumps small JSON files into the temporary buffer
+    3. ejqtde_save               read the buffered small files, filter them and merge them into per-period output files
+    *4. ejqtde_save.delete()     (optional) delete every file in the temporary buffer (back them up first)
+'''
+
+
+# Worker of the multithread pool: collect the article links from one index page
+def extract_href(link):
+    driver = webdriver.Edge(options=options)
+    driver.get(link)
+    html_code = driver.page_source
+    soup = BeautifulSoup(html_code, 'html.parser')
+    column_right = soup.find('div', id='columnRight')
+    if column_right:
+        ordered_lists = column_right.find_all('ol')
+        for idx, ordered_list in enumerate(ordered_lists, 1):
+            for list_item in ordered_list.find_all('li'):
+                matches = re.findall(r'href="([^"]+)"', str(list_item))
+                for match in matches:
+                    with locks:
+                        Links.append(urljoin('https://www.math.u-szeged.hu/ejqtde/', match))
+    driver.quit()
+
+
+# Options setting
+options = Options()
+options.add_argument('--headless')   # Run Edge in headless mode
+options.add_argument('disable-gpu')  # Disable GPU acceleration
+
+# Shared data of all threads
+Links = []           # article detail-page links
+Article_list = []    # buffer of scraped article records
+Author_list = []     # buffer of scraped author records
+locks = threading.Lock()
+count1 = 0
+count2 = 0
+
+# ==========Collect the article links==========
+Volume_pages = [urljoin('https://www.math.u-szeged.hu/ejqtde/', 'periodica.html?periodica=1')]
+with ThreadPoolExecutor(max_workers=5) as executor:
+    wait([executor.submit(extract_href, page) for page in Volume_pages])
+print(len(Links), ' links have been collected.')
+
+# ==========Scrawl every article==========
+with ThreadPoolExecutor(max_workers=5) as executor:
+    futures = [executor.submit(ejqtde_scrawler.scrawler, link, locks, Article_list, Author_list)
+               for link in Links]
+    for future in as_completed(futures):
+        if len(Article_list) >= 50:
+            with locks:
+                count1 += len(Article_list)
+                ejqtde_save.save_data(Article_list, "Article_TS")
+                Article_list.clear()
+
+        if len(Author_list) >= 50:
+            with locks:
+                count2 += len(Author_list)
+                ejqtde_save.save_data(Author_list, "Author_TS")
+                Author_list.clear()
+    wait(futures)
+
+    # Deal with the remaining data
+    if len(Article_list) > 0:
+        count1 += len(Article_list)
+        ejqtde_save.save_data(Article_list, "Article_TS")
+        Article_list.clear()
+        print('Finished: All article_data has been added to ./EJQTDE_buffer/Article_TS/')
+    if len(Author_list) > 0:
+        count2 += len(Author_list)
+        ejqtde_save.save_data(Author_list, "Author_TS")
+        Author_list.clear()
+        print('Finished: All author_data has been added to ./EJQTDE_buffer/Author_TS/')
+
+print('\nThe whole scrawler program has been done\n')
+print(count1, ' article_data has been stored.')
+print(count2, ' author_data has been stored.')
+
+ejqtde_save.Transf()
+ejqtde_save.delete()
\ No newline at end of file
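The driver script above accumulates scraped records in two shared lists and flushes them to the JSON staging folder whenever a list reaches 50 entries, so a long crawl never keeps the whole journal in memory. A minimal, self-contained sketch of that flush-on-threshold pattern (`flush` and `scrape_one` are illustrative stand-ins for `ejqtde_save.save_data` and `ejqtde_scrawler.scrawler`, not code from this patch):

    import threading
    import uuid
    from concurrent.futures import ThreadPoolExecutor, as_completed

    FLUSH_SIZE = 50
    buffer_lock = threading.Lock()
    article_buffer = []

    def flush(records, label):
        # Stand-in for ejqtde_save.save_data: each batch would go to a uniquely named JSON file.
        print('would write', len(records), 'records to ./EJQTDE_buffer/' + label + '/' + str(uuid.uuid4()) + '.json')

    def scrape_one(n):
        # Stand-in for ejqtde_scrawler.scrawler: append one scraped record to the shared buffer.
        with buffer_lock:
            article_buffer.append({'article_id': str(uuid.uuid4()), 'n': n})

    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(scrape_one, n) for n in range(120)]
        for future in as_completed(futures):
            with buffer_lock:
                if len(article_buffer) >= FLUSH_SIZE:
                    flush(article_buffer, 'Article_TS')
                    article_buffer.clear()

    # Whatever is left after the pool has drained still has to be flushed once.
    if article_buffer:
        flush(article_buffer, 'Article_TS')
        article_buffer.clear()

In this sketch the lock is held around both the length check and the clear, so a batch can never be flushed twice when several futures complete at the same time.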
+ "/" + os.makedirs(directory, exist_ok=True) + filepath = os.path.join(directory, filename) + with open(filepath, "w", encoding='utf-8') as json_file: + json.dump(dataset, json_file, indent=4) + print(filetype + " data have been added to", filepath) + + +# Summary files +def Transf(): + def Read(folder_path, output_files): + # 新建文件夹 + os.makedirs('./EJQTDE_buffer/Article_output/', exist_ok=True) + os.makedirs('./EJQTDE_buffer/Author_output/', exist_ok=True) + + data_oldest = [] + data_2010_2014 = [] + data_2015_2020 = [] + data_newest = [] + + for filename in os.listdir(folder_path): + if filename.endswith('.json'): + file_path = os.path.join(folder_path, filename) + with open(file_path, 'r', encoding='utf-8') as file: + data = json.load(file) + + # Select data + data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)] + + data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)] + + data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)] + + data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)] + + Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest] + + # Transfer + for index in range(0, 4): + with open(output_files[index], 'w', encoding='utf-8') as file: + json.dump(Data[index], file, indent=4) + + # The path of reading + author_folder_path = './EJQTDE_buffer/Author_TS' + article_folder_path = './EJQTDE_buffer/Article_TS' + + # The path of storage + author_output_file = [ + './EJQTDE_buffer/Author_output/Author_output_file(oldest).json', + './EJQTDE_buffer/Author_output/Author_output_file(2010-2014).json', + './EJQTDE_buffer/Author_output/Author_output_file(2015-2020).json', + './EJQTDE_buffer/Author_output/Author_output_file(newest).json' + ] + + article_output_file = [ + './EJQTDE_buffer/Article_output/Article_output_file(oldest).json', + './EJQTDE_buffer/Article_output/Article_output_file(2010-2014).json', + './EJQTDE_buffer/Article_output/Article_output_file(2015-2020).json', + './EJQTDE_buffer/Article_output/Article_output_file(newest).json' + ] + + # Read and write into files + Read(author_folder_path, author_output_file) + Read(article_folder_path, article_output_file) + + # End + print("\nData has been written into files.") + + +# Delete files in temporary storage area +def delete(): + folder_paths = ['./EJQTDE_buffer/Author_TS', './EJQTDE_buffer/Article_TS'] + for folder_path in folder_paths: + file_names = os.listdir(folder_path) + for file_name in file_names: + file_path = os.path.join(folder_path, file_name) + if os.path.isfile(file_path): + os.remove(file_path) + + print('\nAttention: The temporary storage files have been deleted!') diff --git a/EJQTDE_spider/ejqtde_scrawler.py b/EJQTDE_spider/ejqtde_scrawler.py new file mode 100644 index 0000000..1f3975d --- /dev/null +++ b/EJQTDE_spider/ejqtde_scrawler.py @@ -0,0 +1,187 @@ +import time +import uuid +import re +import urllib + +from selenium.webdriver.edge.options import Options +from selenium import webdriver +from bs4 import BeautifulSoup +from urllib.parse import urljoin + + +# Get the information in the webpage through selenium +def source(driver, num): + if driver.find_elements(by='id', 
diff --git a/EJQTDE_spider/ejqtde_scrawler.py b/EJQTDE_spider/ejqtde_scrawler.py
new file mode 100644
index 0000000..1f3975d
--- /dev/null
+++ b/EJQTDE_spider/ejqtde_scrawler.py
@@ -0,0 +1,187 @@
+import time
+import uuid
+import re
+import urllib
+
+from selenium.webdriver.edge.options import Options
+from selenium import webdriver
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin
+
+
+# Get the information in the webpage through selenium
+def source(driver, num):
+    if driver.find_elements(by='id', value='columnRight'):
+        html_code = driver.page_source
+        soup = BeautifulSoup(html_code, 'html.parser')
+        return soup
+    elif num == 5:
+        print('Out of retries!')
+        driver.quit()
+        return None
+    else:
+        num += 1
+        time.sleep(3)
+        return source(driver, num)
+
+
+# Get the links of the authors' information
+def author_links(Data):
+    Author_links = []
+    Author_hrefs_pattern = re.compile(r'periodica\.html\?periodica=1&'
+                                      r'paramtipus_ertek=person_data&param_ertek=\d+')
+    Author_hrefs = re.findall(Author_hrefs_pattern, str(Data))
+    for Author_href in Author_hrefs:
+        Author_href = urllib.parse.urljoin('https://www.math.u-szeged.hu/ejqtde/', Author_href)
+        Author_links.append(Author_href)
+
+    return Author_links
+
+
+# Get the information of the authors
+def author_detail(Data, Year, article_id, Author_list):
+    # Name
+    author = Data.find('p', class_='publication_head').get_text()
+
+    author = author.split(',')
+    author = [char.replace(' ', '') for char in author]
+
+    Firstname = author[0]
+    Lastname = author[-1]
+    Middlename = ''.join(author[1:-1]) if author[1:-1] else None
+
+    # Information table
+    table = Data.find('table', attrs={'border': '1', 'cellpadding': '2px'})
+    Td = table.find_all('td')
+    line = [td for td in Td]
+
+    # Affiliation
+    Affiliation = line[1].get_text()
+
+    # Email
+    Email = line[0].find('a').get('href')
+
+    author_data = {
+        "author_id": str(uuid.uuid4()),
+        "from_article": article_id,
+        "firstname": Firstname,
+        "lastname": Lastname,
+        "middlename": Middlename,
+        "affiliation": [
+            {
+                "year": Year,
+                "affiliation": Affiliation,
+                "email": Email
+            }
+        ]
+    }
+
+    Author_list.append(author_data)
+    return Author_list
+
+
+# Get the article's information
+def article_detail(Data, URL, article_id, Article_list):
+    # Title
+    font = Data.find('font', attrs={'size': '+1'})
+    Title = font.find('b').get_text()
+
+    # Author and Corresponding_authors
+    author_pattern = re.compile(r'periodica\.html\?periodica=1&'
+                                r'paramtipus_ertek=person_data&param_ertek=\d+">(.*?)</a>')
+    Author = re.findall(author_pattern, str(Data))
+    Corresponding_author = Author[-1]  # The last listed author is taken as the corresponding author
+    del Author[-1]
+
+    # Submit_datetime and publish_datetime
+    dates = Data.find('td', attrs={'align': 'right', 'width': '50%'})
+    dates = re.findall(r'\d+-\d+-\d+', str(dates))
+    Submit_date = dates[0] if len(dates) > 0 else None
+    Publish_date = dates[1] if len(dates) > 1 else None
+
+    # Keyword
+    Keyword = Data.find('keywords').get_text() if Data.find('keywords') is not None else None
+    Keyword = Keyword.split(', ') if Keyword is not None else None
+
+    # MSC
+    MSC = Data.find('subjectcodes').get_text() if Data.find('subjectcodes') is not None else None
+    MSC = MSC.split(', ') if MSC is not None else None
+
+    # DOI
+    doi_matches = re.findall(r'doi\.org/(10\.[^\s"\'<>]+)', str(Data))
+    DOI = doi_matches[0] if doi_matches else None
+
+    # Publisher, journal and volume (the journal is hosted by the University of Szeged;
+    # EJQTDE numbers its volumes by year, so the volume is taken from the publish date)
+    Publisher = 'University of Szeged'
+    Journal = 'Electronic Journal of Qualitative Theory of Differential Equations'
+    Volume = (re.findall(r'\d{4}', Publish_date) or [None])[0] if Publish_date else None
+
+    # Issue and page
+    result = Data.select_one('body > div:nth-of-type(3) > div:nth-of-type(2)').get_text()
+    Issue = re.findall(r'(\d+), \d+-\d+', result)[0]
+    Page = re.findall(r'\d+, (\d+-\d+)', result)[0]
+
+    article_data = {
+        "article_id": article_id,
+        "title": Title,
+        "authors": Author,
+        "corresponding_authors": Corresponding_author,
+        "submit_datetime": Submit_date,
+        "publish_datetime": Publish_date,
+        "keywords": Keyword,
+        "MSC": MSC,
+        "URL": URL,
+        "DOI": DOI,
+        "publisher": Publisher,
+        "journal": Journal,
+        "volume": Volume,
+        "issue": Issue,
+        "page": Page,
+    }
+
+    Article_list.append(article_data)
+    return Article_list
+
+
+# Main code of scrawler
+def scrawler(URL, lock, Article_list, Author_list):
+    print('Start: ', URL)
+    driver = webdriver.Edge(options=options)
+    driver.get(URL)
+
+    # Enter the detail page
+    Max_retryTimes = 3
+    Essay_data = source(driver, Max_retryTimes)
+    if Essay_data is not None:
+        article_id = str(uuid.uuid4())
+        Article_list = article_detail(Essay_data, URL, article_id, Article_list)
+
+        # Get the authors' information
+        Year = re.findall(r'(\d+)', str(Essay_data))[0]
+        for author_link in author_links(Essay_data):
+            driver.get(author_link)
+            Author_detail = source(driver, Max_retryTimes)
+            if Author_detail is not None:
+                Author_list = author_detail(Author_detail, Year, article_id, Author_list)
+
+        print('Complete: ', URL)
+        driver.quit()
+
+    else:
+        print('Wrong: Some error occurred: ', URL)
+        pass
+
+
+# Options setting
+options = Options()
+options.add_argument('--headless')  # Run Edge in headless mode
+options.add_argument('disable-gpu')  # Disable GPU acceleration
+options.add_argument('pageLoadStrategy=none')  # Set page load strategy to 'none'
+
+
diff --git a/SpringerOpen_spider/SD_detail.py b/SpringerOpen_spider/SD_detail.py
index 8dd02dc..73639c7 100644
--- a/SpringerOpen_spider/SD_detail.py
+++ b/SpringerOpen_spider/SD_detail.py
@@ -88,7 +88,7 @@ def Article_dict(soup, url, article_id):
         time = time.get_text()
         Time.append(time)
 
-    Submitted_date = Time[0]
+    Submit_date = Time[0]
     Publish_date = Time[-1]
 
     # keyword
@@ -132,7 +132,7 @@ def Article_dict(soup, url, article_id):
         "title": Title,
         "authors": Author,
         "corresponding_authors": Corresponding_author,
-        "submit_datetime": Submitted_date,
+        "submit_datetime": Submit_date,
         "publish_datetime": Publish_date,
         "keywords": Keyword,
         "MSC": MSC,
diff --git a/SpringerOpen_spider/SD_header.py b/SpringerOpen_spider/SD_header.py
deleted file mode 100644
index 162e82d..0000000
--- a/SpringerOpen_spider/SD_header.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import random
-
-# 用户代理地址池
-uapools=[
-    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201",
-    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3493.3 Safari/537.36",
-    "Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12",
-    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0" ,
-    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
-    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
-    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
-    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
-    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
-]
-
-def header():
-    # 网站请求头
-    headers = {
-        'User-Agent': random.choice(uapools),
-    }
-
-    return headers
-
-
-
diff --git a/SpringerOpen_spider/SD_link.py b/SpringerOpen_spider/SD_link.py
index a46d542..1b5eed1 100644
--- a/SpringerOpen_spider/SD_link.py
+++ b/SpringerOpen_spider/SD_link.py
@@ -1,6 +1,28 @@
+import random
 import requests
 from bs4 import BeautifulSoup
 
+# Pool of user-agent strings
+uapools=[
+    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3493.3 Safari/537.36",
+    "Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0" ,
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11', + 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1', + 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50', + 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1', +] + +def header(): + # 网站请求头 + headers = { + 'User-Agent': random.choice(uapools), + } + + return headers + # 标准访问格式 def Link(url, headers): try: diff --git a/SpringerOpen_spider/SD_main.py b/SpringerOpen_spider/SD_main.py index e94fd72..c56f993 100644 --- a/SpringerOpen_spider/SD_main.py +++ b/SpringerOpen_spider/SD_main.py @@ -1,6 +1,5 @@ import urllib -import SD_header import SD_link import SD_threads import SD_save @@ -20,8 +19,8 @@ from urllib.parse import urljoin # 存放网页链接的空列表 -Links = [] # A list for links Webs = [] # A list for web url +Links = [] # A list for links # 存放爬取数据的空列表 Article_data = [] @@ -29,7 +28,7 @@ Author_data = [] # ==========访问论文列表页========== # 获取数学类期刊网站链接 -headers = SD_header.header() +headers = SD_link.header() soup = SD_link.Link('https://www.springeropen.com/journals', headers) hrefs = soup.find('ol', id='Mathematics-list') diff --git a/SpringerOpen_spider/SD_save.py b/SpringerOpen_spider/SD_save.py index b088f5c..713fc76 100644 --- a/SpringerOpen_spider/SD_save.py +++ b/SpringerOpen_spider/SD_save.py @@ -20,6 +20,11 @@ def Transf(): os.makedirs('./SpringerOpen_buffer/Article_output/', exist_ok=True) os.makedirs('./SpringerOpen_buffer/Author_output/', exist_ok=True) + data_oldest = [] + data_2010_2014 = [] + data_2015_2020 = [] + data_newest = [] + for filename in os.listdir(folder_path): if filename.endswith('.json'): file_path = os.path.join(folder_path, filename) @@ -27,16 +32,16 @@ def Transf(): data = json.load(file) # 筛选文章 - data_oldest = [Dict for Dict in data if (isinstance(Dict, dict) and int( + data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int( Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)] - data_2010_2014 = [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int( + data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int( Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)] - data_2015_2020 = [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int( + data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int( Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)] - data_newest = [Dict for Dict in data if (isinstance(Dict, dict) and int( + data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int( Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)] Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest] diff --git a/SpringerOpen_spider/SD_scrawl.py b/SpringerOpen_spider/SD_scrawl.py index c79f41a..d2b8ed2 100644 --- a/SpringerOpen_spider/SD_scrawl.py +++ b/SpringerOpen_spider/SD_scrawl.py @@ -3,7 +3,6 @@ import urllib import uuid from urllib.parse import urljoin -import SD_header import SD_link import SD_detail import SD_save @@ -11,7 +10,7 @@ import SD_save # ==========获取论文详情页链接========== def Scrawl(Link, Article_data, Author_data): # 访问论文列表页 - headers = SD_header.header() + headers = SD_link.header() soup = SD_link.Link(Link, headers) print(Link)
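With SD_header.py folded into SD_link.py, every SpringerOpen module now takes both its rotating User-Agent header and its page fetcher from a single place. A minimal usage sketch of that pairing (the shortened uapools list, the timeout value and the local function names are illustrative; only the try/except structure mirrors SD_link.Link):

    import random
    import requests
    from bs4 import BeautifulSoup

    uapools = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3493.3 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    ]

    def header():
        # Pick a random User-Agent for each request, as SD_link.header() does.
        return {'User-Agent': random.choice(uapools)}

    def link(url, headers):
        # Fetch a page and hand back a parsed soup, mirroring SD_link.Link().
        try:
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
            return BeautifulSoup(response.text, 'html.parser')
        except requests.RequestException as error:
            print('Request failed:', error)
            return None

    soup = link('https://www.springeropen.com/journals', header())
    if soup is not None:
        print(soup.title.get_text() if soup.title else 'no <title> found')

Keeping header() next to Link() means SD_main and SD_scrawl only need the single SD_link import, which is exactly what the two hunks above change.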