From cfa9345a797868844ed200b458b11b801421b8f9 Mon Sep 17 00:00:00 2001 From: XCX <1361986662@qq.com> Date: Wed, 26 Jul 2023 23:25:30 +0800 Subject: [PATCH 1/4] Update a new spider code for math.u-szeged.hu/ejqtde. Modified the code of SpringerOpen_spider --- EJQTDE_spider/ejqtde_href_multithread.py | 111 ++++++++++++++ EJQTDE_spider/ejqtde_save.py | 93 +++++++++++ EJQTDE_spider/ejqtde_scrawler.py | 187 +++++++++++++++++++++++ SpringerOpen_spider/SD_detail.py | 4 +- SpringerOpen_spider/SD_header.py | 25 --- SpringerOpen_spider/SD_link.py | 22 +++ SpringerOpen_spider/SD_main.py | 5 +- SpringerOpen_spider/SD_save.py | 13 +- SpringerOpen_spider/SD_scrawl.py | 3 +- 9 files changed, 427 insertions(+), 36 deletions(-) create mode 100644 EJQTDE_spider/ejqtde_href_multithread.py create mode 100644 EJQTDE_spider/ejqtde_save.py create mode 100644 EJQTDE_spider/ejqtde_scrawler.py delete mode 100644 SpringerOpen_spider/SD_header.py diff --git a/EJQTDE_spider/ejqtde_href_multithread.py b/EJQTDE_spider/ejqtde_href_multithread.py new file mode 100644 index 0000000..311feca --- /dev/null +++ b/EJQTDE_spider/ejqtde_href_multithread.py @@ -0,0 +1,111 @@ +import re +import datetime +import threading +import urllib +import ejqtde_scrawler +import ejqtde_save + +from selenium import webdriver +from bs4 import BeautifulSoup +from selenium.webdriver.edge.options import Options +from concurrent.futures import ThreadPoolExecutor, as_completed, wait +from urllib.parse import urljoin + +''' + 爬取网站:'https://www.math.u-szeged.hu/ejqtde + + ==========运行顺序========== + 1、ejqtde_href_multithread 获取各年份的期刊链接 + 2、ejqtde_scrawler 抓取各篇论文的信息和作者信息 -> 调用ejqtde_save -> 存入小文件(json)暂存 + 3、ejqtde_save 从本地浏览暂存的小文件筛选后存入不同年份的大文件 + *4、ejqtde_save.delete()(可选) 删除暂存区内部所有文件(注意备份) +''' + + +# Multithread pool +def extract_href(link): + driver = webdriver.Edge(options=options) + driver.get(link) + html_code = driver.page_source + soup = BeautifulSoup(html_code, 'html.parser') + column_right = soup.find('div', id='columnRight') + if column_right: + ordered_lists = column_right.find_all('ol') + for idx, ordered_list in enumerate(ordered_lists, 1): + for list_item in ordered_list.find_all('li'): + matches = re.findall(r': = 50: + with locks: + count1 += len(Article_list) + ejqtde_save.save_data(Article_list, "Article_TS") + Article_list.clear() + + if len(Author_list) >= 50: + with locks: + count2 += len(Author_list) + ejqtde_save.save_data(Author_list, "Author_TS") + Author_list.clear() + wait(futures) + + # Deal with the remaining data + if len(Article_list) > 0: + count1 += len(Article_list) + ejqtde_save.save_data(Article_list, "Article_TS") + Article_list.clear() + print('Finished: All article_data has been added to ./EJQTDE_buffer/Article_TS/') + if len(Author_list) > 0: + count2 += len(Author_list) + ejqtde_save.save_data(Author_list, "Author_TS") + Author_list.clear() + print('Finished: All author_data has been added to ./EJQTDE_buffer/Author_TS/') + +print('\nThe whole scrawler program has been done\n') +print(count1, ' article_data has been stored.') +print(count2, ' author_data has been stored.') + +ejqtde_save.Transf() +ejqtde_save.delete() \ No newline at end of file diff --git a/EJQTDE_spider/ejqtde_save.py b/EJQTDE_spider/ejqtde_save.py new file mode 100644 index 0000000..693159a --- /dev/null +++ b/EJQTDE_spider/ejqtde_save.py @@ -0,0 +1,93 @@ +import os +import json +import uuid + + +# Save into files +def save_data(dataset, filetype): + if dataset: + filename = str(uuid.uuid4()) + ".json" + directory = "./EJQTDE_buffer/" + 
filetype + "/" + os.makedirs(directory, exist_ok=True) + filepath = os.path.join(directory, filename) + with open(filepath, "w", encoding='utf-8') as json_file: + json.dump(dataset, json_file, indent=4) + print(filetype + " data have been added to", filepath) + + +# Summary files +def Transf(): + def Read(folder_path, output_files): + # 新建文件夹 + os.makedirs('./EJQTDE_buffer/Article_output/', exist_ok=True) + os.makedirs('./EJQTDE_buffer/Author_output/', exist_ok=True) + + data_oldest = [] + data_2010_2014 = [] + data_2015_2020 = [] + data_newest = [] + + for filename in os.listdir(folder_path): + if filename.endswith('.json'): + file_path = os.path.join(folder_path, filename) + with open(file_path, 'r', encoding='utf-8') as file: + data = json.load(file) + + # Select data + data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)] + + data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)] + + data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)] + + data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)] + + Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest] + + # Transfer + for index in range(0, 4): + with open(output_files[index], 'w', encoding='utf-8') as file: + json.dump(Data[index], file, indent=4) + + # The path of reading + author_folder_path = './EJQTDE_buffer/Author_TS' + article_folder_path = './EJQTDE_buffer/Article_TS' + + # The path of storage + author_output_file = [ + './EJQTDE_buffer/Author_output/Author_output_file(oldest).json', + './EJQTDE_buffer/Author_output/Author_output_file(2010-2014).json', + './EJQTDE_buffer/Author_output/Author_output_file(2015-2020).json', + './EJQTDE_buffer/Author_output/Author_output_file(newest).json' + ] + + article_output_file = [ + './EJQTDE_buffer/Article_output/Article_output_file(oldest).json', + './EJQTDE_buffer/Article_output/Article_output_file(2010-2014).json', + './EJQTDE_buffer/Article_output/Article_output_file(2015-2020).json', + './EJQTDE_buffer/Article_output/Article_output_file(newest).json' + ] + + # Read and write into files + Read(author_folder_path, author_output_file) + Read(article_folder_path, article_output_file) + + # End + print("\nData has been written into files.") + + +# Delete files in temporary storage area +def delete(): + folder_paths = ['./EJQTDE_buffer/Author_TS', './EJQTDE_buffer/Article_TS'] + for folder_path in folder_paths: + file_names = os.listdir(folder_path) + for file_name in file_names: + file_path = os.path.join(folder_path, file_name) + if os.path.isfile(file_path): + os.remove(file_path) + + print('\nAttention: The temporary storage files have been deleted!') diff --git a/EJQTDE_spider/ejqtde_scrawler.py b/EJQTDE_spider/ejqtde_scrawler.py new file mode 100644 index 0000000..1f3975d --- /dev/null +++ b/EJQTDE_spider/ejqtde_scrawler.py @@ -0,0 +1,187 @@ +import time +import uuid +import re +import urllib + +from selenium.webdriver.edge.options import Options +from selenium import webdriver +from bs4 import BeautifulSoup +from urllib.parse import urljoin + + +# Get the information in the webpage through selenium +def source(driver, num): + if 
driver.find_elements(by='id', value='columnRight'): + html_code = driver.page_source + soup = BeautifulSoup(html_code, 'html.parser') + return soup + elif num == 5: + print('Out of times!') + driver.quit() + return None + else: + num += 1 + time.sleep(3) + return source(driver, num) + + +# Get the links of the authors' information +def author_links(Data): + Author_links = [] + Author_hrefs_pattern = re.compile(r'periodica\.html\?periodica=1&' + 'paramtipus_ertek=person_data&param_ertek=\d+') + Author_hrefs = re.findall(Author_hrefs_pattern, str(Data)) + for Author_href in Author_hrefs: + Author_href = urllib.parse.urljoin('https://www.math.u-szeged.hu/ejqtde/', Author_href) + Author_links.append(Author_href) + + return Author_links + + +# Get the information of the authors +def author_detail(Data, Year, article_id, Author_list): + # Name + author = Data.find('p', class_='publication_head').get_text() + + author = author.split(',') + author = [char.replace(' ', '') for char in author] + + Firstname = author[0] + Lastname = author[-1] + Middlename = ''.join(author[1:-1]) if author[1:-1] else None + + # infor + table = Data.find('table', attrs={'border': '1', 'cellpadding': '2px'}) + Td = table.find_all('td') + line = [td for td in Td] + + # Affiliation + Affiliation = line[1].get_text() + + # Email + Email = line[0].find('a').get('href') + + author_data = { + "author_id": str(uuid.uuid4()), + "from_article": article_id, + "firstname": Firstname, + "lastname": Lastname, + "middlename": Middlename, + "affiliation": [ + { + "year": Year, + "affiliation": Affiliation, + "email": Email + } + ] + } + + Author_list.append(author_data) + return Author_list + + +# Get the article's information +def article_detail(Data, URL, article_id, Aricle_list): + # Title + font = Data.find('font', attrs={'size': '+1'}) + Title = font.find('b').get_text() + + # Author and Corresponding_authors + author_pattern = re.compile(r'periodica\.html\?periodica=1&' + r'paramtipus_ertek=person_data&param_ertek=\d+">(.*?)') + Author = re.findall(author_pattern, str(Data)) + Corresponding_author = Author[-1] # Corresponding_authors + del Author[-1] + + # Submit_datetime and publish_datetime + time = Data.find('td', attrs={'align': 'right', 'width': '50%'}) + time = re.findall(r'\d+-\d+-\d+', str(time)) + Submit_date = time[0] if time[0] else None + Publish_date = time[1] if time[1] else None + + # Keyword + Keyword = Data.find('keywords').get_text() if Data.find('keywords') is not None else None + Keyword = Keyword.split(', ') if Keyword is not None else None + + # MSC + MSC = Data.find('subjectcodes').get_text() if Data.find('subjectcodes') is not None else None + MSC = MSC.split(', ') if MSC is not None else None + + # DOI + if len(re.findall(r' 0: + DOI = re.findall(r'(\d+)', str(Data))[0] + + # Issue and page + result = Data.select_one('body > div:nth-of-type(3) > div:nth-of-type(2)').get_text() + Issue = re.findall(r'(\d+), \d+-\d+', result)[0] + Page = re.findall(r'\d+, (\d+-\d+)', result)[0] + + article_data = { + "article_id": article_id, + "title": Title, + "authors": Author, + "corresponding_authors": Corresponding_author, + "submit_datetime": Submit_date, + "publish_datetime": Publish_date, + "keywords": Keyword, + "MSC": MSC, + "URL": URL, + "DOI": DOI, + "publisher": Publisher, + "journal": Journal, + "volume": Volume, + "issue": Issue, + "page": Page, + } + + Aricle_list.append(article_data) + return Aricle_list + + +# Main code of scrawler +def scrawler(URL, lock, Article_list, Author_list): + print('Start: ', 
URL) + driver = webdriver.Edge(options=options) + driver.get(URL) + + # Enter the detail page + Max_retryTimes = 3 + Essay_data = source(driver, Max_retryTimes) + if Essay_data is not None: + article_id = str(uuid.uuid4()) + Article_list = article_detail(Essay_data, URL, article_id, Article_list) + + # Get the authors' information + Year = re.findall(r'(\d+)', str(Essay_data))[0] + for author_link in author_links(Essay_data): + driver.get(author_link) + Author_detail = source(driver, Max_retryTimes) + Author_list = author_detail(Author_detail, Year, article_id, Author_list) + + + print('Complete: ', URL) + driver.quit() + + else: + print('Wrong: Some error occurred: ', URL) + pass + + +# Options setting +options = Options() +options.add_argument('--headless') # Run Edge in headless mode +options.add_argument('disable-gpu') # Disable GPU acceleration +options.add_argument('pageLoadStrategy=none') # Set page load strategy to 'none' + + diff --git a/SpringerOpen_spider/SD_detail.py b/SpringerOpen_spider/SD_detail.py index 8dd02dc..73639c7 100644 --- a/SpringerOpen_spider/SD_detail.py +++ b/SpringerOpen_spider/SD_detail.py @@ -88,7 +88,7 @@ def Article_dict(soup, url, article_id): time = time.get_text() Time.append(time) - Submitted_date = Time[0] + Submit_date = Time[0] Publish_date = Time[-1] # keyword @@ -132,7 +132,7 @@ def Article_dict(soup, url, article_id): "title": Title, "authors": Author, "corresponding_authors": Corresponding_author, - "submit_datetime": Submitted_date, + "submit_datetime": Submit_date, "publish_datetime": Publish_date, "keywords": Keyword, "MSC": MSC, diff --git a/SpringerOpen_spider/SD_header.py b/SpringerOpen_spider/SD_header.py deleted file mode 100644 index 162e82d..0000000 --- a/SpringerOpen_spider/SD_header.py +++ /dev/null @@ -1,25 +0,0 @@ -import random - -# 用户代理地址池 -uapools=[ - "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201", - "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3493.3 Safari/537.36", - "Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12", - "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0" , - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11', - 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1', - 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50', - 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1', -] - -def header(): - # 网站请求头 - headers = { - 'User-Agent': random.choice(uapools), - } - - return headers - - - diff --git a/SpringerOpen_spider/SD_link.py b/SpringerOpen_spider/SD_link.py index a46d542..1b5eed1 100644 --- a/SpringerOpen_spider/SD_link.py +++ b/SpringerOpen_spider/SD_link.py @@ -1,6 +1,28 @@ +import random import requests from bs4 import BeautifulSoup +# 用户代理地址池 +uapools=[ + "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201", + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3493.3 Safari/537.36", + "Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12", + "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 
Firefox/6.0" , + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11', + 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1', + 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50', + 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1', +] + +def header(): + # 网站请求头 + headers = { + 'User-Agent': random.choice(uapools), + } + + return headers + # 标准访问格式 def Link(url, headers): try: diff --git a/SpringerOpen_spider/SD_main.py b/SpringerOpen_spider/SD_main.py index e94fd72..c56f993 100644 --- a/SpringerOpen_spider/SD_main.py +++ b/SpringerOpen_spider/SD_main.py @@ -1,6 +1,5 @@ import urllib -import SD_header import SD_link import SD_threads import SD_save @@ -20,8 +19,8 @@ from urllib.parse import urljoin # 存放网页链接的空列表 -Links = [] # A list for links Webs = [] # A list for web url +Links = [] # A list for links # 存放爬取数据的空列表 Article_data = [] @@ -29,7 +28,7 @@ Author_data = [] # ==========访问论文列表页========== # 获取数学类期刊网站链接 -headers = SD_header.header() +headers = SD_link.header() soup = SD_link.Link('https://www.springeropen.com/journals', headers) hrefs = soup.find('ol', id='Mathematics-list') diff --git a/SpringerOpen_spider/SD_save.py b/SpringerOpen_spider/SD_save.py index b088f5c..713fc76 100644 --- a/SpringerOpen_spider/SD_save.py +++ b/SpringerOpen_spider/SD_save.py @@ -20,6 +20,11 @@ def Transf(): os.makedirs('./SpringerOpen_buffer/Article_output/', exist_ok=True) os.makedirs('./SpringerOpen_buffer/Author_output/', exist_ok=True) + data_oldest = [] + data_2010_2014 = [] + data_2015_2020 = [] + data_newest = [] + for filename in os.listdir(folder_path): if filename.endswith('.json'): file_path = os.path.join(folder_path, filename) @@ -27,16 +32,16 @@ def Transf(): data = json.load(file) # 筛选文章 - data_oldest = [Dict for Dict in data if (isinstance(Dict, dict) and int( + data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int( Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)] - data_2010_2014 = [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int( + data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int( Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)] - data_2015_2020 = [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int( + data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int( Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)] - data_newest = [Dict for Dict in data if (isinstance(Dict, dict) and int( + data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int( Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)] Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest] diff --git a/SpringerOpen_spider/SD_scrawl.py b/SpringerOpen_spider/SD_scrawl.py index c79f41a..d2b8ed2 100644 --- a/SpringerOpen_spider/SD_scrawl.py +++ b/SpringerOpen_spider/SD_scrawl.py @@ -3,7 +3,6 @@ import urllib import uuid from urllib.parse import urljoin -import SD_header import SD_link import SD_detail import SD_save @@ -11,7 +10,7 @@ import SD_save # ==========获取论文详情页链接========== def Scrawl(Link, Article_data, Author_data): # 访问论文列表页 - headers = SD_header.header() + headers = SD_link.header() soup = SD_link.Link(Link, headers) 
print(Link) From 26fed37e17cce99dfab45043d0bd9d8630e4d7c5 Mon Sep 17 00:00:00 2001 From: XCX <1361986662@qq.com> Date: Thu, 27 Jul 2023 10:26:02 +0800 Subject: [PATCH 2/4] Modified old code --- EJDE_spider/Transf.py | 38 ---- .../{ejde_scrawler.py => ejde_main.py} | 47 ++--- EJDE_spider/ejde_save.py | 93 ++++++++++ ...tde_href_multithread.py => ejqtde_main.py} | 6 +- EJQTDE_spider/ejqtde_save.py | 33 ++-- ProjectEuclid_spider/projecteuclid_main | 168 ++++++++++++++++++ SpringerOpen_spider/SD_save.py | 30 ++-- 7 files changed, 324 insertions(+), 91 deletions(-) delete mode 100644 EJDE_spider/Transf.py rename EJDE_spider/{ejde_scrawler.py => ejde_main.py} (82%) create mode 100644 EJDE_spider/ejde_save.py rename EJQTDE_spider/{ejqtde_href_multithread.py => ejqtde_main.py} (94%) create mode 100644 ProjectEuclid_spider/projecteuclid_main diff --git a/EJDE_spider/Transf.py b/EJDE_spider/Transf.py deleted file mode 100644 index dce6a10..0000000 --- a/EJDE_spider/Transf.py +++ /dev/null @@ -1,38 +0,0 @@ -import os -import json - -# Function -# Get the data from input files -def Read(folder_path): - data = [] - - for filename in os.listdir(folder_path): - if filename.endswith('.json'): - file_path = os.path.join(folder_path, filename) - with open(file_path, 'r', encoding='utf-8') as file: - data.extend(json.load(file)) - return data - -# Write into output files -def Write(data, output_file): - with open(output_file, 'w', encoding='utf-8') as file: - json.dump(data, file, indent=4) - -# Path of files need to be read -folder_path1 = '.\ejde_buffer\Author' -folder_path2 = '.\ejde_buffer\Article' - -# Read the data in the files -Author_data = Read(folder_path1) -Article_data = Read(folder_path2) - -# The path of output files -output_file1 = '.\ejde_buffer\Author_output_file.json' -output_file2 = '.\ejde_buffer\Article_output_file.json' - -# Write into files -Write(Author_data, output_file1) -Write(Article_data, output_file2) - -# End -print("\nData has been written into files.") \ No newline at end of file diff --git a/EJDE_spider/ejde_scrawler.py b/EJDE_spider/ejde_main.py similarity index 82% rename from EJDE_spider/ejde_scrawler.py rename to EJDE_spider/ejde_main.py index 49b95e0..ec679e8 100644 --- a/EJDE_spider/ejde_scrawler.py +++ b/EJDE_spider/ejde_main.py @@ -1,23 +1,22 @@ -import os import uuid import requests -from bs4 import BeautifulSoup import re -import json +import ejde_save + from concurrent.futures import ThreadPoolExecutor, as_completed from retrying import retry +from bs4 import BeautifulSoup +''' + 爬取网站:'ejde.math.txstate.edu' -def save_data(dataset, filetype, filename): - if dataset: - directory = "./ejde_buffer/" + filetype + "/" - os.makedirs(directory, exist_ok=True) - filepath = os.path.join(directory, filename) - with open(filepath, "w", encoding='utf-8') as json_file: - json.dump(dataset, json_file, indent=4) - print(filetype + " data have been added to", filepath) - + ==========运行顺序========== + 1、ejde_main 获取各年份的期刊链接 -> 抓取各篇论文的信息和作者信息 -> 调用ejde_save -> 存入小文件(json)暂存 + 2、ejde_save 从本地浏览暂存的小文件筛选后存入不同年份的大文件 + *3、ejde_save.delete()(可选) 删除暂存区内部所有文件(注意备份) +''' +# Article and author detail @retry(wait_fixed=5000, stop_max_attempt_number=5) def process_article(url): response = requests.get(url) @@ -43,7 +42,7 @@ def process_article(url): # Extract volume volume_match = re.search(r'Vol\. 
(\d+) \((\d+)\)', article_text) - volume = volume_match.group(1) if volume_match else None + volume = str(volume_match.group(1)) if volume_match else None # year = volume_match.group(2) if volume_match else None # Extract pp @@ -141,11 +140,11 @@ def process_article(url): # Save the data periodically based on batch size if len(articleData) % batch_size == 0: - save_data(articleData, "Article", str(uuid.uuid4()) + ".json") + ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json") articleData.clear() if len(authorData) % batch_size == 0: - save_data(authorData, "Author", str(uuid.uuid4()) + ".json") + ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json") authorData.clear() @@ -162,7 +161,7 @@ url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_l authorData = [] articleData = [] -batch_size = 500 # Number of articles to process before saving +batch_size = 5 # Number of articles to process before saving executor = ThreadPoolExecutor(max_workers=20) # Set the number of worker threads # Process each URL using multithreading @@ -176,10 +175,14 @@ for future in as_completed(futures): print("An error occurred:", str(e)) # Save remaining data -if articleData: - save_data(articleData, "Article", str(uuid.uuid4()) + ".json") - print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article/") +if len(articleData) > 0: + ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json") + print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/") -if authorData: - save_data(authorData, "Author", str(uuid.uuid4()) + ".json") - print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author/") +if len(authorData) > 0: + ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json") + print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/") + +# Transfer to large file and delete the temporary storage files +ejde_save.Transf() +ejde_save.delete() diff --git a/EJDE_spider/ejde_save.py b/EJDE_spider/ejde_save.py new file mode 100644 index 0000000..5b67447 --- /dev/null +++ b/EJDE_spider/ejde_save.py @@ -0,0 +1,93 @@ +import os +import json + + +# Save data +def save_data(dataset, filetype, filename): + if dataset: + directory = "./ejde_buffer/" + filetype + "/" + os.makedirs(directory, exist_ok=True) + filepath = os.path.join(directory, filename) + with open(filepath, "w", encoding='utf-8') as json_file: + json.dump(dataset, json_file, indent=4) + print(filetype + " data have been added to", filepath) + + +# Write into output files +def Transf(): + def Read(folder_path, output_files): + # Create new folders + os.makedirs('./ejde_buffer/Article_output/', exist_ok=True) + os.makedirs('./ejde_buffer/Author_output/', exist_ok=True) + + data_oldest = [] + data_2010_2014 = [] + data_2015_2020 = [] + data_newest = [] + + for filename in os.listdir(folder_path): + if filename.endswith('.json'): + file_path = os.path.join(folder_path, filename) + with open(file_path, 'r', encoding='utf-8') as file: + data = json.load(file) + + for Dict in data: + if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None: + # Select data + data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)] + + data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int( + Dict.get('volume') or Dict.get('affiliation', 
[{}])[0].get('year', 0)) <= 2014)] + + data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)] + + data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)] + + Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest] + + # Transfer + for index in range(0, 4): + with open(output_files[index], 'w', encoding='utf-8') as file: + json.dump(Data[index], file, indent=4) + + # The path of reading + author_folder_path = './ejde_buffer/Author_TS' + article_folder_path = './ejde_buffer/Article_TS' + + # The path of storage + author_output_file = [ + './ejde_buffer/Author_output/Author_output_file(oldest).json', + './ejde_buffer/Author_output/Author_output_file(2010-2014).json', + './ejde_buffer/Author_output/Author_output_file(2015-2020).json', + './ejde_buffer/Author_output/Author_output_file(newest).json' + ] + + article_output_file = [ + './ejde_buffer/Article_output/Article_output_file(oldest).json', + './ejde_buffer/Article_output/Article_output_file(2010-2014).json', + './ejde_buffer/Article_output/Article_output_file(2015-2020).json', + './ejde_buffer/Article_output/Article_output_file(newest).json' + ] + + # Read and write into files + Read(author_folder_path, author_output_file) + Read(article_folder_path, article_output_file) + + # End + print("\nData has been written into files.") + + +# Delete files in temporary storage area +def delete(): + folder_paths = ['./ejde_buffer/Author_TS', './ejde_buffer/Article_TS'] + for folder_path in folder_paths: + file_names = os.listdir(folder_path) + for file_name in file_names: + file_path = os.path.join(folder_path, file_name) + if os.path.isfile(file_path): + os.remove(file_path) + + print('\nAttention: The temporary storage files have been deleted!') diff --git a/EJQTDE_spider/ejqtde_href_multithread.py b/EJQTDE_spider/ejqtde_main.py similarity index 94% rename from EJQTDE_spider/ejqtde_href_multithread.py rename to EJQTDE_spider/ejqtde_main.py index 311feca..1a045ff 100644 --- a/EJQTDE_spider/ejqtde_href_multithread.py +++ b/EJQTDE_spider/ejqtde_main.py @@ -12,10 +12,10 @@ from concurrent.futures import ThreadPoolExecutor, as_completed, wait from urllib.parse import urljoin ''' - 爬取网站:'https://www.math.u-szeged.hu/ejqtde + 爬取网站:'https://www.math.u-szeged.hu/ejqtde' ==========运行顺序========== - 1、ejqtde_href_multithread 获取各年份的期刊链接 + 1、ejqtde_main 获取各年份的期刊链接 2、ejqtde_scrawler 抓取各篇论文的信息和作者信息 -> 调用ejqtde_save -> 存入小文件(json)暂存 3、ejqtde_save 从本地浏览暂存的小文件筛选后存入不同年份的大文件 *4、ejqtde_save.delete()(可选) 删除暂存区内部所有文件(注意备份) @@ -70,6 +70,7 @@ with ThreadPoolExecutor(max_workers=25) as executor: wait(futures) print('\nAll links have been got.\n') +# Use multithreading to get the data count1 = 0 count2 = 0 locks = threading.Lock() @@ -107,5 +108,6 @@ print('\nThe whole scrawler program has been done\n') print(count1, ' article_data has been stored.') print(count2, ' author_data has been stored.') +# Transfer to large file and delete the temporary storage files ejqtde_save.Transf() ejqtde_save.delete() \ No newline at end of file diff --git a/EJQTDE_spider/ejqtde_save.py b/EJQTDE_spider/ejqtde_save.py index 693159a..5ae8e54 100644 --- a/EJQTDE_spider/ejqtde_save.py +++ b/EJQTDE_spider/ejqtde_save.py @@ -18,7 +18,7 @@ def save_data(dataset, filetype): # Summary files def Transf(): def Read(folder_path, output_files): - # 新建文件夹 + # Create new folder 
os.makedirs('./EJQTDE_buffer/Article_output/', exist_ok=True) os.makedirs('./EJQTDE_buffer/Author_output/', exist_ok=True) @@ -33,25 +33,27 @@ def Transf(): with open(file_path, 'r', encoding='utf-8') as file: data = json.load(file) - # Select data - data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int( - Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)] + for Dict in data: + if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None: + # Select data + data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)] - data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int( - Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)] + data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)] - data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int( - Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)] + data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)] - data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int( - Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)] + data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)] - Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest] + Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest] - # Transfer - for index in range(0, 4): - with open(output_files[index], 'w', encoding='utf-8') as file: - json.dump(Data[index], file, indent=4) + # Transfer + for index in range(0, 4): + with open(output_files[index], 'w', encoding='utf-8') as file: + json.dump(Data[index], file, indent=4) # The path of reading author_folder_path = './EJQTDE_buffer/Author_TS' @@ -91,3 +93,4 @@ def delete(): os.remove(file_path) print('\nAttention: The temporary storage files have been deleted!') + diff --git a/ProjectEuclid_spider/projecteuclid_main b/ProjectEuclid_spider/projecteuclid_main new file mode 100644 index 0000000..9ca21b1 --- /dev/null +++ b/ProjectEuclid_spider/projecteuclid_main @@ -0,0 +1,168 @@ +import requests +from bs4 import BeautifulSoup,Tag +import json +import re +import uuid + +main_page_urls = [ + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-11_2f_12", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-9_2f_10", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-7_2f_8", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-5_2f_6", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-3_2f_4", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-1_2f_2", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-11_2f_12", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-9_2f_10", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-7_2f_8", + 
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-5_2f_6", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-3_2f_4", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-1_2f_2", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-11_2f_12", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-9_2f_10", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-7_2f_8", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-5_2f_6", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-3_2f_4", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-1_2f_2", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-11_2f_12", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-9_2f_10", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-7_2f_8", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-5_2f_6", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-3_2f_4", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-1_2f_2", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-11_2f_12", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-9_2f_10", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-7_2f_8", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-5_2f_6", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-3_2f_4", + "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-1_2f_2" + + +] + +all_d_list = [] + +# 遍历多个主页面的链接 +for main_page_url in main_page_urls: + response = requests.get(main_page_url) + html = response.text + soup = BeautifulSoup(html, "html.parser") + + pattern = re.compile(r'^/journals/differential-and-integral-equations/') + links = soup.find_all("a", href=pattern) + sub_urls = [link["href"] for link in links if link["href"].endswith(".full")] + + # 访问子链接并进行信息提取 + for sub_url in sub_urls: + full_sub_url = "https://projecteuclid.org" + sub_url + sub_response = requests.get(full_sub_url) + sub_html = sub_response.text + + # 执行子界面上的信息提取 + sub_soup = BeautifulSoup(sub_html, "html.parser") + + #寻找作者 + author_tags = sub_soup.find_all('meta', {'name': 'citation_author'}) + + authors = {} # 用于存储作者信息的字典 + + #对每一个的作者信息进行处理 + for i, tag in enumerate(author_tags, 1): + citation_author = tag['content'] + authors[i] = citation_author if citation_author else None + + #寻找文章的基本信息 + titles = [] + for title in sub_soup.find_all('meta',{'name':'citation_title'}): + if title.get('content') is not None: + titles.append(title.get('content')) + + + + + #寻找发布时间 + publish_times = [] + for publish_time in sub_soup.find_all('meta',{'name':'publish_date'}): + if publish_time.get('content'): + publish_times.append(str(publish_time.get('content'))) + else: + publish_time.append('None') + + + #寻找关键词 + keywords_list=[] + for keywords in sub_soup.find_all('meta',{'name':'citation_keywords'}): + if keywords.get('content'): + 
keywords_list.append(keywords.get('content')) + else: + keywords_list.append('None') + + + #寻找doi + dois = [] + for doi in sub_soup.find_all('meta', {'name': 'citation_doi'}): + dois.append(str(doi.get('content'))) + doi_separated = ";\n".join(dois) + + + #寻找volume + volumes=[] + for volume in sub_soup.find_all('meta',{'name':'citation_volume'}): + if volume.get('content'): + volumes.append(volume.get('content')) + else: + volumes.append('None') + volume_separated = ";\n".join(volumes) + + #寻找issue + issues=[] + for issue in sub_soup.find_all('meta',{'name':'citation_issue'}): + + issues.append(issue.get('content')) + issue_separated = ";\n".join(issues) + + + #寻找首页 + firstpages=[] + for firstpage in sub_soup.find_all('meta',{'name':'citation_firstpage'}): + firstpages.append(firstpage.get('content')) + + + #寻找尾页 + lastpages=[] + for lastpage in sub_soup.find_all('meta',{'name':'citation_lastpage'}): + lastpages.append(lastpage.get('content')) + + #寻找MSC + MSC=[] + for msc in sub_soup.find_all('meta',{'name':'dc.Subject'}): + MSC.append(msc.get('content')) + MSC_separated = ";\n".join(MSC) + + all_d={"article_id:":str(uuid.uuid4()), + "Author":authors, + "correspond_author":"null", + "Title":titles, + "Publish Time":publish_times, + "keywords":keywords_list, + "DOI":doi_separated , + "volume":volume_separated , + "issue":issue_separated, + "url":full_sub_url, + "page": "-".join(firstpages) + "-" + "-".join(lastpages), + "journal":"projecteuclid.org", + "MSC":MSC_separated} + #print(all_d) + # 写入JSON文件 + + all_d_list.append(all_d) + + # 将信息存储到列表中 + # all_d_list.append(...) + +# 输出存储的信息 +# print(all_d_list) +with open('articles.json', 'w') as f: + json.dump(all_d_list, f, indent=2) + +print("JSON文件已成功生成。") + diff --git a/SpringerOpen_spider/SD_save.py b/SpringerOpen_spider/SD_save.py index 713fc76..0d7119d 100644 --- a/SpringerOpen_spider/SD_save.py +++ b/SpringerOpen_spider/SD_save.py @@ -31,25 +31,27 @@ def Transf(): with open(file_path, 'r', encoding='utf-8') as file: data = json.load(file) - # 筛选文章 - data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int( - Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)] + for Dict in data: + if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None: + # 筛选文章 + data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)] - data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int( - Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)] + data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)] - data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int( - Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)] + data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)] - data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int( - Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)] + data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)] - Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest] + Data = 
[data_oldest, data_2010_2014, data_2015_2020, data_newest] - # 转存 - for index in range(0, 4): - with open(output_files[index], 'w', encoding='utf-8') as file: - json.dump(Data[index], file, indent=4) + # 转存 + for index in range(0, 4): + with open(output_files[index], 'w', encoding='utf-8') as file: + json.dump(Data[index], file, indent=4) # 读取路径 From 07c334a903b152a986bff5b094db5a17f3ea949f Mon Sep 17 00:00:00 2001 From: XCX Date: Thu, 27 Jul 2023 10:28:51 +0800 Subject: [PATCH 3/4] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 该文件已经移动至其他文件夹ProjectEuclid_spider,并且本地已经备份原文件 Signed-off-by: XCX --- projecteuclid_spider | 168 ------------------------------------------- 1 file changed, 168 deletions(-) delete mode 100644 projecteuclid_spider diff --git a/projecteuclid_spider b/projecteuclid_spider deleted file mode 100644 index 9ca21b1..0000000 --- a/projecteuclid_spider +++ /dev/null @@ -1,168 +0,0 @@ -import requests -from bs4 import BeautifulSoup,Tag -import json -import re -import uuid - -main_page_urls = [ - "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-11_2f_12", - "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-9_2f_10", - "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-7_2f_8", - "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-5_2f_6", - "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-3_2f_4", - "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-1_2f_2", - "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-11_2f_12", - "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-9_2f_10", - "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-7_2f_8", - "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-5_2f_6", - "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-3_2f_4", - "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-1_2f_2", - "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-11_2f_12", - "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-9_2f_10", - "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-7_2f_8", - "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-5_2f_6", - "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-3_2f_4", - "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-1_2f_2", - "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-11_2f_12", - "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-9_2f_10", - "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-7_2f_8", - "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-5_2f_6", - "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-3_2f_4", - "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-1_2f_2", - 
"https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-11_2f_12", - "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-9_2f_10", - "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-7_2f_8", - "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-5_2f_6", - "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-3_2f_4", - "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-1_2f_2" - - -] - -all_d_list = [] - -# 遍历多个主页面的链接 -for main_page_url in main_page_urls: - response = requests.get(main_page_url) - html = response.text - soup = BeautifulSoup(html, "html.parser") - - pattern = re.compile(r'^/journals/differential-and-integral-equations/') - links = soup.find_all("a", href=pattern) - sub_urls = [link["href"] for link in links if link["href"].endswith(".full")] - - # 访问子链接并进行信息提取 - for sub_url in sub_urls: - full_sub_url = "https://projecteuclid.org" + sub_url - sub_response = requests.get(full_sub_url) - sub_html = sub_response.text - - # 执行子界面上的信息提取 - sub_soup = BeautifulSoup(sub_html, "html.parser") - - #寻找作者 - author_tags = sub_soup.find_all('meta', {'name': 'citation_author'}) - - authors = {} # 用于存储作者信息的字典 - - #对每一个的作者信息进行处理 - for i, tag in enumerate(author_tags, 1): - citation_author = tag['content'] - authors[i] = citation_author if citation_author else None - - #寻找文章的基本信息 - titles = [] - for title in sub_soup.find_all('meta',{'name':'citation_title'}): - if title.get('content') is not None: - titles.append(title.get('content')) - - - - - #寻找发布时间 - publish_times = [] - for publish_time in sub_soup.find_all('meta',{'name':'publish_date'}): - if publish_time.get('content'): - publish_times.append(str(publish_time.get('content'))) - else: - publish_time.append('None') - - - #寻找关键词 - keywords_list=[] - for keywords in sub_soup.find_all('meta',{'name':'citation_keywords'}): - if keywords.get('content'): - keywords_list.append(keywords.get('content')) - else: - keywords_list.append('None') - - - #寻找doi - dois = [] - for doi in sub_soup.find_all('meta', {'name': 'citation_doi'}): - dois.append(str(doi.get('content'))) - doi_separated = ";\n".join(dois) - - - #寻找volume - volumes=[] - for volume in sub_soup.find_all('meta',{'name':'citation_volume'}): - if volume.get('content'): - volumes.append(volume.get('content')) - else: - volumes.append('None') - volume_separated = ";\n".join(volumes) - - #寻找issue - issues=[] - for issue in sub_soup.find_all('meta',{'name':'citation_issue'}): - - issues.append(issue.get('content')) - issue_separated = ";\n".join(issues) - - - #寻找首页 - firstpages=[] - for firstpage in sub_soup.find_all('meta',{'name':'citation_firstpage'}): - firstpages.append(firstpage.get('content')) - - - #寻找尾页 - lastpages=[] - for lastpage in sub_soup.find_all('meta',{'name':'citation_lastpage'}): - lastpages.append(lastpage.get('content')) - - #寻找MSC - MSC=[] - for msc in sub_soup.find_all('meta',{'name':'dc.Subject'}): - MSC.append(msc.get('content')) - MSC_separated = ";\n".join(MSC) - - all_d={"article_id:":str(uuid.uuid4()), - "Author":authors, - "correspond_author":"null", - "Title":titles, - "Publish Time":publish_times, - "keywords":keywords_list, - "DOI":doi_separated , - "volume":volume_separated , - "issue":issue_separated, - "url":full_sub_url, - "page": "-".join(firstpages) + "-" + "-".join(lastpages), - "journal":"projecteuclid.org", - "MSC":MSC_separated} - 
#print(all_d) - # 写入JSON文件 - - all_d_list.append(all_d) - - # 将信息存储到列表中 - # all_d_list.append(...) - -# 输出存储的信息 -# print(all_d_list) -with open('articles.json', 'w') as f: - json.dump(all_d_list, f, indent=2) - -print("JSON文件已成功生成。") - From c1e1e59e052f5ef4d5f059873400f9b66323a43a Mon Sep 17 00:00:00 2001 From: XCX Date: Thu, 27 Jul 2023 10:30:26 +0800 Subject: [PATCH 4/4] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20EJQTDE=5Fspider/ejqtde?= =?UTF-8?q?=5Fmain.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- EJQTDE_spider/ejqtde_main.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/EJQTDE_spider/ejqtde_main.py b/EJQTDE_spider/ejqtde_main.py index 1a045ff..4917e6c 100644 --- a/EJQTDE_spider/ejqtde_main.py +++ b/EJQTDE_spider/ejqtde_main.py @@ -15,10 +15,10 @@ from urllib.parse import urljoin 爬取网站:'https://www.math.u-szeged.hu/ejqtde' ==========运行顺序========== - 1、ejqtde_main 获取各年份的期刊链接 - 2、ejqtde_scrawler 抓取各篇论文的信息和作者信息 -> 调用ejqtde_save -> 存入小文件(json)暂存 - 3、ejqtde_save 从本地浏览暂存的小文件筛选后存入不同年份的大文件 - *4、ejqtde_save.delete()(可选) 删除暂存区内部所有文件(注意备份) + 1、ejqtde_main 获取各年份的期刊链接 + 2、ejqtde_scrawler 抓取各篇论文的信息和作者信息 -> 调用ejqtde_save -> 存入小文件(json)暂存 + 3、ejqtde_save 从本地浏览暂存的小文件筛选后存入不同年份的大文件 + *4、ejqtde_save.delete()(可选) 删除暂存区内部所有文件(注意备份) '''
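
The four-way split on publication year recurs in three of the save modules touched above (SD_save.py, ejde_save.py, ejqtde_save.py). For reference, a minimal standalone sketch of that shared rule, assuming the record shapes the spiders build ('volume' on article dicts, affiliation[0]['year'] on author dicts); the helper names below are illustrative only, not part of the patches.

# Sketch of the shared year-bucketing rule, assuming the record fields named above.
def record_year(record):
    # Articles carry the year in 'volume'; author records carry it in
    # affiliation[0]['year']. Fall back to 0 when neither is present,
    # which mirrors how the save modules treat missing values.
    return int(record.get('volume') or record.get('affiliation', [{}])[0].get('year', 0))

def bucket_records(records):
    # Route each dict into one of the four date ranges used by Transf().
    buckets = {'oldest': [], '2010-2014': [], '2015-2020': [], 'newest': []}
    for record in records:
        if not isinstance(record, dict):
            continue
        year = record_year(record)
        if year <= 2009:
            buckets['oldest'].append(record)
        elif year <= 2014:
            buckets['2010-2014'].append(record)
        elif year <= 2020:
            buckets['2015-2020'].append(record)
        else:
            buckets['newest'].append(record)
    return buckets

# Example: bucket_records([{'volume': '2016'}, {'affiliation': [{'year': 2008}]}])
# places the first record in '2015-2020' and the second in 'oldest'.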