From cfa9345a797868844ed200b458b11b801421b8f9 Mon Sep 17 00:00:00 2001
From: XCX <1361986662@qq.com>
Date: Wed, 26 Jul 2023 23:25:30 +0800
Subject: [PATCH 1/4] Add a new spider for math.u-szeged.hu/ejqtde.
Also modified the SpringerOpen_spider code.
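For reviewers, a minimal sketch of how the new EJQTDE modules are intended to be driven once ejqtde_href_multithread has collected the per-year links (the article URL below is only a placeholder, not a real article link; the calls use the scrawler(), save_data(), Transf() and delete() functions added in this patch):

    import threading

    import ejqtde_save
    import ejqtde_scrawler

    Article_list, Author_list = [], []
    lock = threading.Lock()

    # Placeholder URL: real article links are produced by ejqtde_href_multithread
    article_url = 'https://www.math.u-szeged.hu/ejqtde/'
    ejqtde_scrawler.scrawler(article_url, lock, Article_list, Author_list)

    ejqtde_save.save_data(Article_list, "Article_TS")  # buffer into ./EJQTDE_buffer/Article_TS/
    ejqtde_save.save_data(Author_list, "Author_TS")    # buffer into ./EJQTDE_buffer/Author_TS/
    ejqtde_save.Transf()    # merge the buffered files into per-year-range output files
    # ejqtde_save.delete()  # optional: clear the buffer afterwards (back up the data first)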
---
EJQTDE_spider/ejqtde_href_multithread.py | 111 ++++++++++++++
EJQTDE_spider/ejqtde_save.py | 93 +++++++++++
EJQTDE_spider/ejqtde_scrawler.py | 187 +++++++++++++++++++++++
SpringerOpen_spider/SD_detail.py | 4 +-
SpringerOpen_spider/SD_header.py | 25 ---
SpringerOpen_spider/SD_link.py | 22 +++
SpringerOpen_spider/SD_main.py | 5 +-
SpringerOpen_spider/SD_save.py | 13 +-
SpringerOpen_spider/SD_scrawl.py | 3 +-
9 files changed, 427 insertions(+), 36 deletions(-)
create mode 100644 EJQTDE_spider/ejqtde_href_multithread.py
create mode 100644 EJQTDE_spider/ejqtde_save.py
create mode 100644 EJQTDE_spider/ejqtde_scrawler.py
delete mode 100644 SpringerOpen_spider/SD_header.py
diff --git a/EJQTDE_spider/ejqtde_href_multithread.py b/EJQTDE_spider/ejqtde_href_multithread.py
new file mode 100644
index 0000000..311feca
--- /dev/null
+++ b/EJQTDE_spider/ejqtde_href_multithread.py
@@ -0,0 +1,111 @@
+import re
+import datetime
+import threading
+import urllib
+import ejqtde_scrawler
+import ejqtde_save
+
+from selenium import webdriver
+from bs4 import BeautifulSoup
+from selenium.webdriver.edge.options import Options
+from concurrent.futures import ThreadPoolExecutor, as_completed, wait
+from urllib.parse import urljoin
+
+'''
+ Target site: 'https://www.math.u-szeged.hu/ejqtde
+
+ ========== Run order ==========
+ 1. ejqtde_href_multithread  collect the issue links for each year
+ 2. ejqtde_scrawler          scrape each article's metadata and author data -> call ejqtde_save -> buffer them in small JSON files
+ 3. ejqtde_save              read the buffered small files locally, filter them, and merge them into large per-year-range files
+ *4. ejqtde_save.delete()    (optional) delete every file in the buffer (back up the data first)
+'''
+
+
+# Multithread pool
+def extract_href(link):
+ driver = webdriver.Edge(options=options)
+ driver.get(link)
+ html_code = driver.page_source
+ soup = BeautifulSoup(html_code, 'html.parser')
+ column_right = soup.find('div', id='columnRight')
+ if column_right:
+ ordered_lists = column_right.find_all('ol')
+ for idx, ordered_list in enumerate(ordered_lists, 1):
+ for list_item in ordered_list.find_all('li'):
+ if len(Article_list) >= 50:
+ with locks:
+ count1 += len(Article_list)
+ ejqtde_save.save_data(Article_list, "Article_TS")
+ Article_list.clear()
+
+ if len(Author_list) >= 50:
+ with locks:
+ count2 += len(Author_list)
+ ejqtde_save.save_data(Author_list, "Author_TS")
+ Author_list.clear()
+ wait(futures)
+
+ # Deal with the remaining data
+ if len(Article_list) > 0:
+ count1 += len(Article_list)
+ ejqtde_save.save_data(Article_list, "Article_TS")
+ Article_list.clear()
+ print('Finished: All article_data has been added to ./EJQTDE_buffer/Article_TS/')
+ if len(Author_list) > 0:
+ count2 += len(Author_list)
+ ejqtde_save.save_data(Author_list, "Author_TS")
+ Author_list.clear()
+ print('Finished: All author_data has been added to ./EJQTDE_buffer/Author_TS/')
+
+print('\nThe whole scrawler program has been done\n')
+print(count1, ' article_data has been stored.')
+print(count2, ' author_data has been stored.')
+
+ejqtde_save.Transf()
+ejqtde_save.delete()
\ No newline at end of file
diff --git a/EJQTDE_spider/ejqtde_save.py b/EJQTDE_spider/ejqtde_save.py
new file mode 100644
index 0000000..693159a
--- /dev/null
+++ b/EJQTDE_spider/ejqtde_save.py
@@ -0,0 +1,93 @@
+import os
+import json
+import uuid
+
+
+# Save into files
+def save_data(dataset, filetype):
+ if dataset:
+ filename = str(uuid.uuid4()) + ".json"
+ directory = "./EJQTDE_buffer/" + filetype + "/"
+ os.makedirs(directory, exist_ok=True)
+ filepath = os.path.join(directory, filename)
+ with open(filepath, "w", encoding='utf-8') as json_file:
+ json.dump(dataset, json_file, indent=4)
+ print(filetype + " data have been added to", filepath)
+
+
+# Summary files
+def Transf():
+ def Read(folder_path, output_files):
+ # Create new folder
+ os.makedirs('./EJQTDE_buffer/Article_output/', exist_ok=True)
+ os.makedirs('./EJQTDE_buffer/Author_output/', exist_ok=True)
+
+ data_oldest = []
+ data_2010_2014 = []
+ data_2015_2020 = []
+ data_newest = []
+
+ for filename in os.listdir(folder_path):
+ if filename.endswith('.json'):
+ file_path = os.path.join(folder_path, filename)
+ with open(file_path, 'r', encoding='utf-8') as file:
+ data = json.load(file)
+
+ # Select data
+ data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
+ Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
+
+ data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
+ Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
+
+ data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
+ Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
+
+ data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
+ Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
+
+ Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
+
+ # Transfer
+ for index in range(0, 4):
+ with open(output_files[index], 'w', encoding='utf-8') as file:
+ json.dump(Data[index], file, indent=4)
+
+ # The path of reading
+ author_folder_path = './EJQTDE_buffer/Author_TS'
+ article_folder_path = './EJQTDE_buffer/Article_TS'
+
+ # The path of storage
+ author_output_file = [
+ './EJQTDE_buffer/Author_output/Author_output_file(oldest).json',
+ './EJQTDE_buffer/Author_output/Author_output_file(2010-2014).json',
+ './EJQTDE_buffer/Author_output/Author_output_file(2015-2020).json',
+ './EJQTDE_buffer/Author_output/Author_output_file(newest).json'
+ ]
+
+ article_output_file = [
+ './EJQTDE_buffer/Article_output/Article_output_file(oldest).json',
+ './EJQTDE_buffer/Article_output/Article_output_file(2010-2014).json',
+ './EJQTDE_buffer/Article_output/Article_output_file(2015-2020).json',
+ './EJQTDE_buffer/Article_output/Article_output_file(newest).json'
+ ]
+
+ # Read and write into files
+ Read(author_folder_path, author_output_file)
+ Read(article_folder_path, article_output_file)
+
+ # End
+ print("\nData has been written into files.")
+
+
+# Delete files in temporary storage area
+def delete():
+ folder_paths = ['./EJQTDE_buffer/Author_TS', './EJQTDE_buffer/Article_TS']
+ for folder_path in folder_paths:
+ file_names = os.listdir(folder_path)
+ for file_name in file_names:
+ file_path = os.path.join(folder_path, file_name)
+ if os.path.isfile(file_path):
+ os.remove(file_path)
+
+ print('\nAttention: The temporary storage files have been deleted!')
diff --git a/EJQTDE_spider/ejqtde_scrawler.py b/EJQTDE_spider/ejqtde_scrawler.py
new file mode 100644
index 0000000..1f3975d
--- /dev/null
+++ b/EJQTDE_spider/ejqtde_scrawler.py
@@ -0,0 +1,187 @@
+import time
+import uuid
+import re
+import urllib
+
+from selenium.webdriver.edge.options import Options
+from selenium import webdriver
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin
+
+
+# Get the information in the webpage through selenium
+def source(driver, num):
+ if driver.find_elements(by='id', value='columnRight'):
+ html_code = driver.page_source
+ soup = BeautifulSoup(html_code, 'html.parser')
+ return soup
+ elif num == 5:
+ print('Out of retries!')
+ driver.quit()
+ return None
+ else:
+ num += 1
+ time.sleep(3)
+ return source(driver, num)
+
+
+# Get the links of the authors' information
+def author_links(Data):
+ Author_links = []
+ Author_hrefs_pattern = re.compile(r'periodica\.html\?periodica=1&'
+ r'paramtipus_ertek=person_data&param_ertek=\d+')
+ Author_hrefs = re.findall(Author_hrefs_pattern, str(Data))
+ for Author_href in Author_hrefs:
+ Author_href = urllib.parse.urljoin('https://www.math.u-szeged.hu/ejqtde/', Author_href)
+ Author_links.append(Author_href)
+
+ return Author_links
+
+
+# Get the information of the authors
+def author_detail(Data, Year, article_id, Author_list):
+ # Name
+ author = Data.find('p', class_='publication_head').get_text()
+
+ author = author.split(',')
+ author = [char.replace(' ', '') for char in author]
+
+ Firstname = author[0]
+ Lastname = author[-1]
+ Middlename = ''.join(author[1:-1]) if author[1:-1] else None
+
+ # infor
+ table = Data.find('table', attrs={'border': '1', 'cellpadding': '2px'})
+ Td = table.find_all('td')
+ line = list(Td)
+
+ # Affiliation
+ Affiliation = line[1].get_text()
+
+ # Email
+ Email = line[0].find('a').get('href')
+
+ author_data = {
+ "author_id": str(uuid.uuid4()),
+ "from_article": article_id,
+ "firstname": Firstname,
+ "lastname": Lastname,
+ "middlename": Middlename,
+ "affiliation": [
+ {
+ "year": Year,
+ "affiliation": Affiliation,
+ "email": Email
+ }
+ ]
+ }
+
+ Author_list.append(author_data)
+ return Author_list
+
+
+# Get the article's information
+def article_detail(Data, URL, article_id, Article_list):
+ # Title
+ font = Data.find('font', attrs={'size': '+1'})
+ Title = font.find('b').get_text()
+
+ # Author and Corresponding_authors
+ author_pattern = re.compile(r'periodica\.html\?periodica=1&'
+ r'paramtipus_ertek=person_data&param_ertek=\d+">(.*?)</a>')
+ Author = re.findall(author_pattern, str(Data))
+ Corresponding_author = Author[-1] # Corresponding_authors
+ del Author[-1]
+
+ # Submit_datetime and publish_datetime
+ dates = Data.find('td', attrs={'align': 'right', 'width': '50%'})
+ dates = re.findall(r'\d+-\d+-\d+', str(dates))
+ Submit_date = dates[0] if len(dates) > 0 else None
+ Publish_date = dates[1] if len(dates) > 1 else None
+
+ # Keyword
+ Keyword = Data.find('keywords').get_text() if Data.find('keywords') is not None else None
+ Keyword = Keyword.split(', ') if Keyword is not None else None
+
+ # MSC
+ MSC = Data.find('subjectcodes').get_text() if Data.find('subjectcodes') is not None else None
+ MSC = MSC.split(', ') if MSC is not None else None
+
+ # DOI
+ DOI = None
+ # Look for a doi.org link in the page (assumed pattern)
+ if len(re.findall(r'doi\.org/([^\s"]+)', str(Data))) > 0:
+ DOI = re.findall(r'doi\.org/([^\s"]+)', str(Data))[0]
+
+ # Issue and page
+ result = Data.select_one('body > div:nth-of-type(3) > div:nth-of-type(2)').get_text()
+ Issue = re.findall(r'(\d+), \d+-\d+', result)[0]
+ Page = re.findall(r'\d+, (\d+-\d+)', result)[0]
+
+ article_data = {
+ "article_id": article_id,
+ "title": Title,
+ "authors": Author,
+ "corresponding_authors": Corresponding_author,
+ "submit_datetime": Submit_date,
+ "publish_datetime": Publish_date,
+ "keywords": Keyword,
+ "MSC": MSC,
+ "URL": URL,
+ "DOI": DOI,
+ "publisher": Publisher,
+ "journal": Journal,
+ "volume": Volume,
+ "issue": Issue,
+ "page": Page,
+ }
+
+ Article_list.append(article_data)
+ return Article_list
+
+
+# Main code of scrawler
+def scrawler(URL, lock, Article_list, Author_list):
+ print('Start: ', URL)
+ driver = webdriver.Edge(options=options)
+ driver.get(URL)
+
+ # Enter the detail page
+ Max_retryTimes = 3
+ Essay_data = source(driver, Max_retryTimes)
+ if Essay_data is not None:
+ article_id = str(uuid.uuid4())
+ Article_list = article_detail(Essay_data, URL, article_id, Article_list)
+
+ # Get the authors' information
+ Year = re.findall(r'(\d+)', str(Essay_data))[0]
+ for author_link in author_links(Essay_data):
+ driver.get(author_link)
+ Author_detail = source(driver, Max_retryTimes)
+ Author_list = author_detail(Author_detail, Year, article_id, Author_list)
+
+
+ print('Complete: ', URL)
+ driver.quit()
+
+ else:
+ print('Error: some error occurred while scraping ', URL)
+
+
+# Options setting
+options = Options()
+options.add_argument('--headless') # Run Edge in headless mode
+options.add_argument('--disable-gpu')  # Disable GPU acceleration
+options.page_load_strategy = 'none'  # Set page load strategy to 'none'
+
+
diff --git a/SpringerOpen_spider/SD_detail.py b/SpringerOpen_spider/SD_detail.py
index 8dd02dc..73639c7 100644
--- a/SpringerOpen_spider/SD_detail.py
+++ b/SpringerOpen_spider/SD_detail.py
@@ -88,7 +88,7 @@ def Article_dict(soup, url, article_id):
time = time.get_text()
Time.append(time)
- Submitted_date = Time[0]
+ Submit_date = Time[0]
Publish_date = Time[-1]
# keyword
@@ -132,7 +132,7 @@ def Article_dict(soup, url, article_id):
"title": Title,
"authors": Author,
"corresponding_authors": Corresponding_author,
- "submit_datetime": Submitted_date,
+ "submit_datetime": Submit_date,
"publish_datetime": Publish_date,
"keywords": Keyword,
"MSC": MSC,
diff --git a/SpringerOpen_spider/SD_header.py b/SpringerOpen_spider/SD_header.py
deleted file mode 100644
index 162e82d..0000000
--- a/SpringerOpen_spider/SD_header.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import random
-
-# 用户代理地址池
-uapools=[
- "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201",
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3493.3 Safari/537.36",
- "Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12",
- "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0" ,
- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
- 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
- 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
- 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
-]
-
-def header():
- # 网站请求头
- headers = {
- 'User-Agent': random.choice(uapools),
- }
-
- return headers
-
-
-
diff --git a/SpringerOpen_spider/SD_link.py b/SpringerOpen_spider/SD_link.py
index a46d542..1b5eed1 100644
--- a/SpringerOpen_spider/SD_link.py
+++ b/SpringerOpen_spider/SD_link.py
@@ -1,6 +1,28 @@
+import random
import requests
from bs4 import BeautifulSoup
+# User-agent pool
+uapools=[
+ "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201",
+ "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3493.3 Safari/537.36",
+ "Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12",
+ "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0" ,
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
+ 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
+ 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
+ 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
+]
+
+def header():
+ # Request headers
+ headers = {
+ 'User-Agent': random.choice(uapools),
+ }
+
+ return headers
+
 # Standard request format
def Link(url, headers):
try:
diff --git a/SpringerOpen_spider/SD_main.py b/SpringerOpen_spider/SD_main.py
index e94fd72..c56f993 100644
--- a/SpringerOpen_spider/SD_main.py
+++ b/SpringerOpen_spider/SD_main.py
@@ -1,6 +1,5 @@
import urllib
-import SD_header
import SD_link
import SD_threads
import SD_save
@@ -20,8 +19,8 @@ from urllib.parse import urljoin
 # Empty lists for the page links
-Links = [] # A list for links
Webs = [] # A list for web url
+Links = [] # A list for links
 # Empty lists for the scraped data
Article_data = []
@@ -29,7 +28,7 @@ Author_data = []
 # ========== Visit the article list pages ==========
 # Get the links of the mathematics journal sites
-headers = SD_header.header()
+headers = SD_link.header()
soup = SD_link.Link('https://www.springeropen.com/journals', headers)
hrefs = soup.find('ol', id='Mathematics-list')
diff --git a/SpringerOpen_spider/SD_save.py b/SpringerOpen_spider/SD_save.py
index b088f5c..713fc76 100644
--- a/SpringerOpen_spider/SD_save.py
+++ b/SpringerOpen_spider/SD_save.py
@@ -20,6 +20,11 @@ def Transf():
os.makedirs('./SpringerOpen_buffer/Article_output/', exist_ok=True)
os.makedirs('./SpringerOpen_buffer/Author_output/', exist_ok=True)
+ data_oldest = []
+ data_2010_2014 = []
+ data_2015_2020 = []
+ data_newest = []
+
for filename in os.listdir(folder_path):
if filename.endswith('.json'):
file_path = os.path.join(folder_path, filename)
@@ -27,16 +32,16 @@ def Transf():
data = json.load(file)
 # Filter the articles
- data_oldest = [Dict for Dict in data if (isinstance(Dict, dict) and int(
+ data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
- data_2010_2014 = [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
+ data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
- data_2015_2020 = [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
+ data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
- data_newest = [Dict for Dict in data if (isinstance(Dict, dict) and int(
+ data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
diff --git a/SpringerOpen_spider/SD_scrawl.py b/SpringerOpen_spider/SD_scrawl.py
index c79f41a..d2b8ed2 100644
--- a/SpringerOpen_spider/SD_scrawl.py
+++ b/SpringerOpen_spider/SD_scrawl.py
@@ -3,7 +3,6 @@ import urllib
import uuid
from urllib.parse import urljoin
-import SD_header
import SD_link
import SD_detail
import SD_save
@@ -11,7 +10,7 @@ import SD_save
 # ========== Get the article detail page links ==========
def Scrawl(Link, Article_data, Author_data):
 # Visit the article list page
- headers = SD_header.header()
+ headers = SD_link.header()
soup = SD_link.Link(Link, headers)
print(Link)
From 26fed37e17cce99dfab45043d0bd9d8630e4d7c5 Mon Sep 17 00:00:00 2001
From: XCX <1361986662@qq.com>
Date: Thu, 27 Jul 2023 10:26:02 +0800
Subject: [PATCH 2/4] Refactor the existing spiders and add a ProjectEuclid spider
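As context for the reworked *_save modules, a small self-contained sketch of the year-bucketing rule that Transf() applies to the buffered records (the two sample records below are invented; real ones come from the *_TS buffer files, where articles carry a 'volume' year and authors carry affiliation[0]['year']):

    import json

    records = [
        {"article_id": "a1", "volume": "2008"},                  # article -> oldest bucket
        {"author_id": "u1", "affiliation": [{"year": "2016"}]},  # author  -> 2015-2020 bucket
    ]

    def bucket_year(record):
        # Same lookup the save modules use: prefer 'volume', fall back to the first affiliation year
        return int(record.get('volume') or record.get('affiliation', [{}])[0].get('year', 0))

    buckets = {'oldest': [], '2010-2014': [], '2015-2020': [], 'newest': []}
    for rec in records:
        year = bucket_year(rec)
        if year <= 2009:
            buckets['oldest'].append(rec)
        elif year <= 2014:
            buckets['2010-2014'].append(rec)
        elif year <= 2020:
            buckets['2015-2020'].append(rec)
        else:
            buckets['newest'].append(rec)

    print(json.dumps(buckets, indent=4))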
---
EJDE_spider/Transf.py | 38 ----
.../{ejde_scrawler.py => ejde_main.py} | 47 ++---
EJDE_spider/ejde_save.py | 93 ++++++++++
...tde_href_multithread.py => ejqtde_main.py} | 6 +-
EJQTDE_spider/ejqtde_save.py | 33 ++--
ProjectEuclid_spider/projecteuclid_main | 168 ++++++++++++++++++
SpringerOpen_spider/SD_save.py | 30 ++--
7 files changed, 324 insertions(+), 91 deletions(-)
delete mode 100644 EJDE_spider/Transf.py
rename EJDE_spider/{ejde_scrawler.py => ejde_main.py} (82%)
create mode 100644 EJDE_spider/ejde_save.py
rename EJQTDE_spider/{ejqtde_href_multithread.py => ejqtde_main.py} (94%)
create mode 100644 ProjectEuclid_spider/projecteuclid_main
diff --git a/EJDE_spider/Transf.py b/EJDE_spider/Transf.py
deleted file mode 100644
index dce6a10..0000000
--- a/EJDE_spider/Transf.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import os
-import json
-
-# Function
-# Get the data from input files
-def Read(folder_path):
- data = []
-
- for filename in os.listdir(folder_path):
- if filename.endswith('.json'):
- file_path = os.path.join(folder_path, filename)
- with open(file_path, 'r', encoding='utf-8') as file:
- data.extend(json.load(file))
- return data
-
-# Write into output files
-def Write(data, output_file):
- with open(output_file, 'w', encoding='utf-8') as file:
- json.dump(data, file, indent=4)
-
-# Path of files need to be read
-folder_path1 = '.\ejde_buffer\Author'
-folder_path2 = '.\ejde_buffer\Article'
-
-# Read the data in the files
-Author_data = Read(folder_path1)
-Article_data = Read(folder_path2)
-
-# The path of output files
-output_file1 = '.\ejde_buffer\Author_output_file.json'
-output_file2 = '.\ejde_buffer\Article_output_file.json'
-
-# Write into files
-Write(Author_data, output_file1)
-Write(Article_data, output_file2)
-
-# End
-print("\nData has been written into files.")
\ No newline at end of file
diff --git a/EJDE_spider/ejde_scrawler.py b/EJDE_spider/ejde_main.py
similarity index 82%
rename from EJDE_spider/ejde_scrawler.py
rename to EJDE_spider/ejde_main.py
index 49b95e0..ec679e8 100644
--- a/EJDE_spider/ejde_scrawler.py
+++ b/EJDE_spider/ejde_main.py
@@ -1,23 +1,22 @@
-import os
import uuid
import requests
-from bs4 import BeautifulSoup
import re
-import json
+import ejde_save
+
from concurrent.futures import ThreadPoolExecutor, as_completed
from retrying import retry
+from bs4 import BeautifulSoup
+'''
+ Target site: 'ejde.math.txstate.edu'
-def save_data(dataset, filetype, filename):
- if dataset:
- directory = "./ejde_buffer/" + filetype + "/"
- os.makedirs(directory, exist_ok=True)
- filepath = os.path.join(directory, filename)
- with open(filepath, "w", encoding='utf-8') as json_file:
- json.dump(dataset, json_file, indent=4)
- print(filetype + " data have been added to", filepath)
-
+ ========== Run order ==========
+ 1. ejde_main  collect the issue links for each year -> scrape each article's metadata and author data -> call ejde_save -> buffer them in small JSON files
+ 2. ejde_save  read the buffered small files locally, filter them, and merge them into large per-year-range files
+ *3. ejde_save.delete()  (optional) delete every file in the buffer (back up the data first)
+'''
+# Article and author detail
@retry(wait_fixed=5000, stop_max_attempt_number=5)
def process_article(url):
response = requests.get(url)
@@ -43,7 +42,7 @@ def process_article(url):
# Extract volume
volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
- volume = volume_match.group(1) if volume_match else None
+ volume = str(volume_match.group(1)) if volume_match else None
# year = volume_match.group(2) if volume_match else None
# Extract pp
@@ -141,11 +140,11 @@ def process_article(url):
# Save the data periodically based on batch size
if len(articleData) % batch_size == 0:
- save_data(articleData, "Article", str(uuid.uuid4()) + ".json")
+ ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
articleData.clear()
if len(authorData) % batch_size == 0:
- save_data(authorData, "Author", str(uuid.uuid4()) + ".json")
+ ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
authorData.clear()
@@ -162,7 +161,7 @@ url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_l
authorData = []
articleData = []
-batch_size = 500 # Number of articles to process before saving
+batch_size = 5 # Number of articles to process before saving
executor = ThreadPoolExecutor(max_workers=20) # Set the number of worker threads
# Process each URL using multithreading
@@ -176,10 +175,14 @@ for future in as_completed(futures):
print("An error occurred:", str(e))
# Save remaining data
-if articleData:
- save_data(articleData, "Article", str(uuid.uuid4()) + ".json")
- print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article/")
+if len(articleData) > 0:
+ ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
+ print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
-if authorData:
- save_data(authorData, "Author", str(uuid.uuid4()) + ".json")
- print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author/")
+if len(authorData) > 0:
+ ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
+ print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
+
+# Transfer to large file and delete the temporary storage files
+ejde_save.Transf()
+ejde_save.delete()
diff --git a/EJDE_spider/ejde_save.py b/EJDE_spider/ejde_save.py
new file mode 100644
index 0000000..5b67447
--- /dev/null
+++ b/EJDE_spider/ejde_save.py
@@ -0,0 +1,93 @@
+import os
+import json
+
+
+# Save data
+def save_data(dataset, filetype, filename):
+ if dataset:
+ directory = "./ejde_buffer/" + filetype + "/"
+ os.makedirs(directory, exist_ok=True)
+ filepath = os.path.join(directory, filename)
+ with open(filepath, "w", encoding='utf-8') as json_file:
+ json.dump(dataset, json_file, indent=4)
+ print(filetype + " data have been added to", filepath)
+
+
+# Write into output files
+def Transf():
+ def Read(folder_path, output_files):
+ # Create new folders
+ os.makedirs('./ejde_buffer/Article_output/', exist_ok=True)
+ os.makedirs('./ejde_buffer/Author_output/', exist_ok=True)
+
+ data_oldest = []
+ data_2010_2014 = []
+ data_2015_2020 = []
+ data_newest = []
+
+ for filename in os.listdir(folder_path):
+ if filename.endswith('.json'):
+ file_path = os.path.join(folder_path, filename)
+ with open(file_path, 'r', encoding='utf-8') as file:
+ data = json.load(file)
+
+ for Dict in data:
+ if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
+ # Select data
+ data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
+ Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
+
+ data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
+ Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
+
+ data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
+ Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
+
+ data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
+ Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
+
+ Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
+
+ # Transfer
+ for index in range(0, 4):
+ with open(output_files[index], 'w', encoding='utf-8') as file:
+ json.dump(Data[index], file, indent=4)
+
+ # The path of reading
+ author_folder_path = './ejde_buffer/Author_TS'
+ article_folder_path = './ejde_buffer/Article_TS'
+
+ # The path of storage
+ author_output_file = [
+ './ejde_buffer/Author_output/Author_output_file(oldest).json',
+ './ejde_buffer/Author_output/Author_output_file(2010-2014).json',
+ './ejde_buffer/Author_output/Author_output_file(2015-2020).json',
+ './ejde_buffer/Author_output/Author_output_file(newest).json'
+ ]
+
+ article_output_file = [
+ './ejde_buffer/Article_output/Article_output_file(oldest).json',
+ './ejde_buffer/Article_output/Article_output_file(2010-2014).json',
+ './ejde_buffer/Article_output/Article_output_file(2015-2020).json',
+ './ejde_buffer/Article_output/Article_output_file(newest).json'
+ ]
+
+ # Read and write into files
+ Read(author_folder_path, author_output_file)
+ Read(article_folder_path, article_output_file)
+
+ # End
+ print("\nData has been written into files.")
+
+
+# Delete files in temporary storage area
+def delete():
+ folder_paths = ['./ejde_buffer/Author_TS', './ejde_buffer/Article_TS']
+ for folder_path in folder_paths:
+ file_names = os.listdir(folder_path)
+ for file_name in file_names:
+ file_path = os.path.join(folder_path, file_name)
+ if os.path.isfile(file_path):
+ os.remove(file_path)
+
+ print('\nAttention: The temporary storage files have been deleted!')
diff --git a/EJQTDE_spider/ejqtde_href_multithread.py b/EJQTDE_spider/ejqtde_main.py
similarity index 94%
rename from EJQTDE_spider/ejqtde_href_multithread.py
rename to EJQTDE_spider/ejqtde_main.py
index 311feca..1a045ff 100644
--- a/EJQTDE_spider/ejqtde_href_multithread.py
+++ b/EJQTDE_spider/ejqtde_main.py
@@ -12,10 +12,10 @@ from concurrent.futures import ThreadPoolExecutor, as_completed, wait
from urllib.parse import urljoin
'''
- Target site: 'https://www.math.u-szeged.hu/ejqtde
+ Target site: 'https://www.math.u-szeged.hu/ejqtde'
 ========== Run order ==========
- 1. ejqtde_href_multithread  collect the issue links for each year
+ 1. ejqtde_main              collect the issue links for each year
 2. ejqtde_scrawler          scrape each article's metadata and author data -> call ejqtde_save -> buffer them in small JSON files
 3. ejqtde_save              read the buffered small files locally, filter them, and merge them into large per-year-range files
 *4. ejqtde_save.delete()    (optional) delete every file in the buffer (back up the data first)
@@ -70,6 +70,7 @@ with ThreadPoolExecutor(max_workers=25) as executor:
wait(futures)
print('\nAll links have been got.\n')
+# Use multithreading to get the data
count1 = 0
count2 = 0
locks = threading.Lock()
@@ -107,5 +108,6 @@ print('\nThe whole scrawler program has been done\n')
print(count1, ' article_data has been stored.')
print(count2, ' author_data has been stored.')
+# Transfer to large file and delete the temporary storage files
ejqtde_save.Transf()
ejqtde_save.delete()
\ No newline at end of file
diff --git a/EJQTDE_spider/ejqtde_save.py b/EJQTDE_spider/ejqtde_save.py
index 693159a..5ae8e54 100644
--- a/EJQTDE_spider/ejqtde_save.py
+++ b/EJQTDE_spider/ejqtde_save.py
@@ -18,7 +18,7 @@ def save_data(dataset, filetype):
# Summary files
def Transf():
def Read(folder_path, output_files):
- # Create new folder
+ # Create new folder
os.makedirs('./EJQTDE_buffer/Article_output/', exist_ok=True)
os.makedirs('./EJQTDE_buffer/Author_output/', exist_ok=True)
@@ -33,25 +33,27 @@ def Transf():
with open(file_path, 'r', encoding='utf-8') as file:
data = json.load(file)
- # Select data
- data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
- Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
+ for Dict in data:
+ if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
+ # Select data
+ data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
+ Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
- data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
- Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
+ data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
+ Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
- data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
- Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
+ data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
+ Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
- data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
- Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
+ data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
+ Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
- Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
+ Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
- # Transfer
- for index in range(0, 4):
- with open(output_files[index], 'w', encoding='utf-8') as file:
- json.dump(Data[index], file, indent=4)
+ # Transfer
+ for index in range(0, 4):
+ with open(output_files[index], 'w', encoding='utf-8') as file:
+ json.dump(Data[index], file, indent=4)
# The path of reading
author_folder_path = './EJQTDE_buffer/Author_TS'
@@ -91,3 +93,4 @@ def delete():
os.remove(file_path)
print('\nAttention: The temporary storage files have been deleted!')
+
diff --git a/ProjectEuclid_spider/projecteuclid_main b/ProjectEuclid_spider/projecteuclid_main
new file mode 100644
index 0000000..9ca21b1
--- /dev/null
+++ b/ProjectEuclid_spider/projecteuclid_main
@@ -0,0 +1,168 @@
+import requests
+from bs4 import BeautifulSoup,Tag
+import json
+import re
+import uuid
+
+main_page_urls = [
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-11_2f_12",
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-9_2f_10",
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-7_2f_8",
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-5_2f_6",
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-3_2f_4",
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-1_2f_2",
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-11_2f_12",
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-9_2f_10",
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-7_2f_8",
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-5_2f_6",
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-3_2f_4",
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-1_2f_2",
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-11_2f_12",
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-9_2f_10",
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-7_2f_8",
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-5_2f_6",
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-3_2f_4",
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-1_2f_2",
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-11_2f_12",
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-9_2f_10",
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-7_2f_8",
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-5_2f_6",
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-3_2f_4",
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-1_2f_2",
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-11_2f_12",
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-9_2f_10",
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-7_2f_8",
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-5_2f_6",
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-3_2f_4",
+ "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-1_2f_2"
+
+
+]
+
+all_d_list = []
+
+# Iterate over the main issue page links
+for main_page_url in main_page_urls:
+ response = requests.get(main_page_url)
+ html = response.text
+ soup = BeautifulSoup(html, "html.parser")
+
+ pattern = re.compile(r'^/journals/differential-and-integral-equations/')
+ links = soup.find_all("a", href=pattern)
+ sub_urls = [link["href"] for link in links if link["href"].endswith(".full")]
+
+ # Visit each article link and extract its information
+ for sub_url in sub_urls:
+ full_sub_url = "https://projecteuclid.org" + sub_url
+ sub_response = requests.get(full_sub_url)
+ sub_html = sub_response.text
+
+ # Extract the information from the article page
+ sub_soup = BeautifulSoup(sub_html, "html.parser")
+
+ # Find the authors
+ author_tags = sub_soup.find_all('meta', {'name': 'citation_author'})
+
+ authors = {}  # Dictionary for the author information
+
+ # Process each author entry
+ for i, tag in enumerate(author_tags, 1):
+ citation_author = tag['content']
+ authors[i] = citation_author if citation_author else None
+
+ # Find the basic article information
+ titles = []
+ for title in sub_soup.find_all('meta',{'name':'citation_title'}):
+ if title.get('content') is not None:
+ titles.append(title.get('content'))
+
+
+
+
+ # Find the publish date
+ publish_times = []
+ for publish_time in sub_soup.find_all('meta',{'name':'publish_date'}):
+ if publish_time.get('content'):
+ publish_times.append(str(publish_time.get('content')))
+ else:
+ publish_times.append('None')
+
+
+ # Find the keywords
+ keywords_list=[]
+ for keywords in sub_soup.find_all('meta',{'name':'citation_keywords'}):
+ if keywords.get('content'):
+ keywords_list.append(keywords.get('content'))
+ else:
+ keywords_list.append('None')
+
+
+ # Find the DOI
+ dois = []
+ for doi in sub_soup.find_all('meta', {'name': 'citation_doi'}):
+ dois.append(str(doi.get('content')))
+ doi_separated = ";\n".join(dois)
+
+
+ # Find the volume
+ volumes=[]
+ for volume in sub_soup.find_all('meta',{'name':'citation_volume'}):
+ if volume.get('content'):
+ volumes.append(volume.get('content'))
+ else:
+ volumes.append('None')
+ volume_separated = ";\n".join(volumes)
+
+ # Find the issue
+ issues=[]
+ for issue in sub_soup.find_all('meta',{'name':'citation_issue'}):
+
+ issues.append(issue.get('content'))
+ issue_separated = ";\n".join(issues)
+
+
+ # Find the first page
+ firstpages=[]
+ for firstpage in sub_soup.find_all('meta',{'name':'citation_firstpage'}):
+ firstpages.append(firstpage.get('content'))
+
+
+ # Find the last page
+ lastpages=[]
+ for lastpage in sub_soup.find_all('meta',{'name':'citation_lastpage'}):
+ lastpages.append(lastpage.get('content'))
+
+ # Find the MSC codes
+ MSC=[]
+ for msc in sub_soup.find_all('meta',{'name':'dc.Subject'}):
+ MSC.append(msc.get('content'))
+ MSC_separated = ";\n".join(MSC)
+
+ all_d = {"article_id": str(uuid.uuid4()),
+ "Author":authors,
+ "correspond_author":"null",
+ "Title":titles,
+ "Publish Time":publish_times,
+ "keywords":keywords_list,
+ "DOI":doi_separated ,
+ "volume":volume_separated ,
+ "issue":issue_separated,
+ "url":full_sub_url,
+ "page": "-".join(firstpages) + "-" + "-".join(lastpages),
+ "journal":"projecteuclid.org",
+ "MSC":MSC_separated}
+ #print(all_d)
+ # Write into the JSON file
+
+ all_d_list.append(all_d)
+
+ # Store the information in the list
+ # all_d_list.append(...)
+
+# Print the stored information
+# print(all_d_list)
+with open('articles.json', 'w') as f:
+ json.dump(all_d_list, f, indent=2)
+
+print("JSON文件已成功生成。")
+
diff --git a/SpringerOpen_spider/SD_save.py b/SpringerOpen_spider/SD_save.py
index 713fc76..0d7119d 100644
--- a/SpringerOpen_spider/SD_save.py
+++ b/SpringerOpen_spider/SD_save.py
@@ -31,25 +31,27 @@ def Transf():
with open(file_path, 'r', encoding='utf-8') as file:
data = json.load(file)
- # Filter the articles
- data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
- Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
+ for Dict in data:
+ if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
+ # Filter the articles
+ data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
+ Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
- data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
- Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
+ data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
+ Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
- data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
- Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
+ data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
+ Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
- data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
- Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
+ data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
+ Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
- Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
+ Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
- # Transfer
- for index in range(0, 4):
- with open(output_files[index], 'w', encoding='utf-8') as file:
- json.dump(Data[index], file, indent=4)
+ # Transfer
+ for index in range(0, 4):
+ with open(output_files[index], 'w', encoding='utf-8') as file:
+ json.dump(Data[index], file, indent=4)
 # The path of reading
From 07c334a903b152a986bff5b094db5a17f3ea949f Mon Sep 17 00:00:00 2001
From: XCX
Date: Thu, 27 Jul 2023 10:28:51 +0800
Subject: [PATCH 3/4] Remove the old projecteuclid_spider file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The file has been moved to the new ProjectEuclid_spider folder, and the original has been backed up locally.
Signed-off-by: XCX
---
projecteuclid_spider | 168 -------------------------------------------
1 file changed, 168 deletions(-)
delete mode 100644 projecteuclid_spider
diff --git a/projecteuclid_spider b/projecteuclid_spider
deleted file mode 100644
index 9ca21b1..0000000
--- a/projecteuclid_spider
+++ /dev/null
@@ -1,168 +0,0 @@
-import requests
-from bs4 import BeautifulSoup,Tag
-import json
-import re
-import uuid
-
-main_page_urls = [
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-11_2f_12",
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-9_2f_10",
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-7_2f_8",
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-5_2f_6",
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-3_2f_4",
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-1_2f_2",
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-11_2f_12",
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-9_2f_10",
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-7_2f_8",
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-5_2f_6",
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-3_2f_4",
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-1_2f_2",
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-11_2f_12",
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-9_2f_10",
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-7_2f_8",
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-5_2f_6",
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-3_2f_4",
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-1_2f_2",
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-11_2f_12",
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-9_2f_10",
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-7_2f_8",
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-5_2f_6",
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-3_2f_4",
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-1_2f_2",
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-11_2f_12",
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-9_2f_10",
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-7_2f_8",
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-5_2f_6",
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-3_2f_4",
- "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-1_2f_2"
-
-
-]
-
-all_d_list = []
-
-# 遍历多个主页面的链接
-for main_page_url in main_page_urls:
- response = requests.get(main_page_url)
- html = response.text
- soup = BeautifulSoup(html, "html.parser")
-
- pattern = re.compile(r'^/journals/differential-and-integral-equations/')
- links = soup.find_all("a", href=pattern)
- sub_urls = [link["href"] for link in links if link["href"].endswith(".full")]
-
- # 访问子链接并进行信息提取
- for sub_url in sub_urls:
- full_sub_url = "https://projecteuclid.org" + sub_url
- sub_response = requests.get(full_sub_url)
- sub_html = sub_response.text
-
- # 执行子界面上的信息提取
- sub_soup = BeautifulSoup(sub_html, "html.parser")
-
- #寻找作者
- author_tags = sub_soup.find_all('meta', {'name': 'citation_author'})
-
- authors = {} # 用于存储作者信息的字典
-
- #对每一个的作者信息进行处理
- for i, tag in enumerate(author_tags, 1):
- citation_author = tag['content']
- authors[i] = citation_author if citation_author else None
-
- #寻找文章的基本信息
- titles = []
- for title in sub_soup.find_all('meta',{'name':'citation_title'}):
- if title.get('content') is not None:
- titles.append(title.get('content'))
-
-
-
-
- #寻找发布时间
- publish_times = []
- for publish_time in sub_soup.find_all('meta',{'name':'publish_date'}):
- if publish_time.get('content'):
- publish_times.append(str(publish_time.get('content')))
- else:
- publish_time.append('None')
-
-
- #寻找关键词
- keywords_list=[]
- for keywords in sub_soup.find_all('meta',{'name':'citation_keywords'}):
- if keywords.get('content'):
- keywords_list.append(keywords.get('content'))
- else:
- keywords_list.append('None')
-
-
- #寻找doi
- dois = []
- for doi in sub_soup.find_all('meta', {'name': 'citation_doi'}):
- dois.append(str(doi.get('content')))
- doi_separated = ";\n".join(dois)
-
-
- #寻找volume
- volumes=[]
- for volume in sub_soup.find_all('meta',{'name':'citation_volume'}):
- if volume.get('content'):
- volumes.append(volume.get('content'))
- else:
- volumes.append('None')
- volume_separated = ";\n".join(volumes)
-
- #寻找issue
- issues=[]
- for issue in sub_soup.find_all('meta',{'name':'citation_issue'}):
-
- issues.append(issue.get('content'))
- issue_separated = ";\n".join(issues)
-
-
- #寻找首页
- firstpages=[]
- for firstpage in sub_soup.find_all('meta',{'name':'citation_firstpage'}):
- firstpages.append(firstpage.get('content'))
-
-
- #寻找尾页
- lastpages=[]
- for lastpage in sub_soup.find_all('meta',{'name':'citation_lastpage'}):
- lastpages.append(lastpage.get('content'))
-
- #寻找MSC
- MSC=[]
- for msc in sub_soup.find_all('meta',{'name':'dc.Subject'}):
- MSC.append(msc.get('content'))
- MSC_separated = ";\n".join(MSC)
-
- all_d={"article_id:":str(uuid.uuid4()),
- "Author":authors,
- "correspond_author":"null",
- "Title":titles,
- "Publish Time":publish_times,
- "keywords":keywords_list,
- "DOI":doi_separated ,
- "volume":volume_separated ,
- "issue":issue_separated,
- "url":full_sub_url,
- "page": "-".join(firstpages) + "-" + "-".join(lastpages),
- "journal":"projecteuclid.org",
- "MSC":MSC_separated}
- #print(all_d)
- # 写入JSON文件
-
- all_d_list.append(all_d)
-
- # 将信息存储到列表中
- # all_d_list.append(...)
-
-# 输出存储的信息
-# print(all_d_list)
-with open('articles.json', 'w') as f:
- json.dump(all_d_list, f, indent=2)
-
-print("JSON文件已成功生成。")
-
From c1e1e59e052f5ef4d5f059873400f9b66323a43a Mon Sep 17 00:00:00 2001
From: XCX
Date: Thu, 27 Jul 2023 10:30:26 +0800
Subject: [PATCH 4/4] Update EJQTDE_spider/ejqtde_main.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
EJQTDE_spider/ejqtde_main.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/EJQTDE_spider/ejqtde_main.py b/EJQTDE_spider/ejqtde_main.py
index 1a045ff..4917e6c 100644
--- a/EJQTDE_spider/ejqtde_main.py
+++ b/EJQTDE_spider/ejqtde_main.py
@@ -15,10 +15,10 @@ from urllib.parse import urljoin
 Target site: 'https://www.math.u-szeged.hu/ejqtde'
 ========== Run order ==========
- 1. ejqtde_main              collect the issue links for each year
- 2. ejqtde_scrawler          scrape each article's metadata and author data -> call ejqtde_save -> buffer them in small JSON files
- 3. ejqtde_save              read the buffered small files locally, filter them, and merge them into large per-year-range files
- *4. ejqtde_save.delete()    (optional) delete every file in the buffer (back up the data first)
+ 1. ejqtde_main              collect the issue links for each year
+ 2. ejqtde_scrawler          scrape each article's metadata and author data -> call ejqtde_save -> buffer them in small JSON files
+ 3. ejqtde_save              read the buffered small files locally, filter them, and merge them into large per-year-range files
+ *4. ejqtde_save.delete()    (optional) delete every file in the buffer (back up the data first)
'''