Update the code from the past few weeks
This commit is contained in:
parent
04806fa367
commit
d8addf5204
38
EJDE_spider/Transf.py
Normal file
@@ -0,0 +1,38 @@
import os
import json


# Function
# Get the data from input files
def Read(folder_path):
    data = []

    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                data.extend(json.load(file))
    return data


# Write into output files
def Write(data, output_file):
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)


# Paths of the files to be read
folder_path1 = './ejde_buffer/Author'
folder_path2 = './ejde_buffer/Article'

# Read the data in the files
Author_data = Read(folder_path1)
Article_data = Read(folder_path2)

# Paths of the output files
output_file1 = './ejde_buffer/Author_output_file.json'
output_file2 = './ejde_buffer/Article_output_file.json'

# Write into files
Write(Author_data, output_file1)
Write(Article_data, output_file2)

# End
print("\nData has been written into files.")
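Note (not part of this commit): a minimal sketch of how the Read helper above could skip empty or malformed buffer files instead of aborting the whole merge; the Read_safe name is illustrative.

import os
import json

def Read_safe(folder_path):
    """Like Read above, but skips files that are not valid JSON."""
    data = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    data.extend(json.load(file))
            except (json.JSONDecodeError, OSError) as e:
                # Report the bad file and keep going with the rest
                print("Skipped", file_path, "->", e)
    return data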
185
EJDE_spider/ejde_scrawler.py
Normal file
@@ -0,0 +1,185 @@
import os
import uuid
import requests
from bs4 import BeautifulSoup
import re
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from retrying import retry


def save_data(dataset, filetype, filename):
    if dataset:
        directory = "./ejde_buffer/" + filetype + "/"
        os.makedirs(directory, exist_ok=True)
        filepath = os.path.join(directory, filename)
        with open(filepath, "w", encoding='utf-8') as json_file:
            json.dump(dataset, json_file, indent=4)
        print(filetype + " data have been added to", filepath)


@retry(wait_fixed=5000, stop_max_attempt_number=5)
def process_article(url):
    response = requests.get(url)
    response.raise_for_status()

    baseWeb = url[:url.rfind('/')] + "/"
    html = response.text
    soup = BeautifulSoup(html, "html.parser")

    articles = soup.find_all("li")

    for article in articles:
        authors = article.find("strong").text.strip().split(", ")
        title = article.find("em").text.strip()
        article_url = baseWeb + article.find("a")["href"]

        # Access article detail page
        response = requests.get(article_url)
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')

        article_text = soup.get_text()

        # Extract volume
        volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
        volume = volume_match.group(1) if volume_match else None
        # year = volume_match.group(2) if volume_match else None

        # Extract page range (pp)
        pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
        pp = pp_match.group(1) if pp_match else None

        # Extract issue
        issue_match = re.search(r'No\. (\d+)', article_text)
        issue = issue_match.group(1) if issue_match else None

        # Extract submission date
        match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
        submitted_date = match.group(1) if match else None

        # Extract publication date
        match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
        publish_date = match.group(1) if match else None

        # Extract MSC
        msc_match = re.search(r'Math Subject Classifications: (.*?)\n', html)
        if not msc_match:
            msc_match = re.search(r'Math Subject Classifications:(.*?)\.', html)
        if msc_match:
            msc = msc_match.group(1).strip().strip('.')
            msc = re.split(r', |;', msc)
        else:
            msc = None

        # Extract keywords
        keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
        if not keywords_match:
            keywords_match = re.search(r'Key Words: (.*?)<br>', html, re.DOTALL)
        if keywords_match:
            keywords = keywords_match.group(1).strip().replace('\n', '')
            keywords = re.split(r', |;', keywords)
            keywords = [keyword.strip().strip('.') for keyword in keywords]
        else:
            keywords = None

        # Extract DOI
        doi_match = re.search(r'DOI: (.+)(?=<)', html)
        if not doi_match:
            doi_match = re.search(r'DOI: (.+)', html)
        doi = doi_match.group(1) if doi_match else None

        # Article id
        article_id = str(uuid.uuid4())

        article_data = {
            "article_id": article_id,
            "title": title,
            "authors": authors,
            "corresponding_authors": None,
            "submit_datetime": submitted_date,
            "publish_datetime": publish_date,
            "keywords": keywords,
            "MSC": msc,
            "URL": article_url,
            "DOI": doi,
            "publisher": "Texas State University",
            "journal": "Electronic Journal of Differential Equations",
            "volume": volume,
            "issue": issue,
            "page": pp,
        }
        articleData.append(article_data)

        # Author info
        table = soup.find('table')
        for row in table.find_all('tr'):
            cells = [cell.text.strip() for cell in row.find_all('td')]
            for cell in cells:
                cell = cell.split("\n")
                cell = [element.replace('email: ', '') for element in cell]
                cell = [c.strip() for c in cell]

                # Data processing
                name = cell[0].split(" ")
                affiliation = ', '.join(cell[1:-1])
                email = cell[-1]

                author_data = {
                    "author_id": str(uuid.uuid4()),
                    "from_article": article_id,
                    "first_name": name[0],
                    "last_name": name[-1],
                    "middle_name": name[1:len(name) - 1] if len(name) > 2 else None,
                    "affiliation": [{
                        "year": volume,
                        "affiliation": affiliation,
                        "email": email,
                    }]
                }
                authorData.append(author_data)

        # Save the data periodically based on batch size
        if len(articleData) % batch_size == 0:
            save_data(articleData, "Article", str(uuid.uuid4()) + ".json")
            articleData.clear()

        if len(authorData) % batch_size == 0:
            save_data(authorData, "Author", str(uuid.uuid4()) + ".json")
            authorData.clear()


index = "https://ejde.math.txstate.edu/indexleft.html"
response = requests.get(index)
soup = BeautifulSoup(response.content, 'html.parser')

# Find all the URL links under the first (Volumes) section
volume_links = soup.select('font > a[href]')

# Extract and store the URLs in a list using a list comprehension
url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1]

authorData = []
articleData = []

batch_size = 500  # Number of articles to process before saving
executor = ThreadPoolExecutor(max_workers=20)  # Set the number of worker threads

# Process each URL using multithreading
futures = [executor.submit(process_article, url) for url in url_list]

# Wait for all tasks to complete
for future in as_completed(futures):
    try:
        future.result()
    except Exception as e:
        print("An error occurred:", str(e))

# Save remaining data
if articleData:
    save_data(articleData, "Article", str(uuid.uuid4()) + ".json")
    print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article/")

if authorData:
    save_data(authorData, "Author", str(uuid.uuid4()) + ".json")
    print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author/")
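Note (not part of this commit): articleData and authorData are appended to and cleared from 20 worker threads at once, so the modulo check and clear() can interleave. A minimal sketch of guarding that step with a lock; data_lock and flush_if_full are illustrative names, and save_fn is expected to have the same signature as save_data above.

import threading
import uuid

data_lock = threading.Lock()  # one lock shared by every worker thread

def flush_if_full(dataset, filetype, batch_size, save_fn):
    """Atomically save and clear the shared list once it reaches batch_size."""
    with data_lock:
        if len(dataset) >= batch_size:
            save_fn(list(dataset), filetype, str(uuid.uuid4()) + ".json")
            dataset.clear()

The per-article appends would need to hold the same lock; process_article would then call flush_if_full(articleData, "Article", batch_size, save_data) instead of the bare modulo check.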
148
SpringerOpen_spider/SD_detail.py
Normal file
@@ -0,0 +1,148 @@
import uuid


# ========== Extract the details ==========
def Author_dict(soup, article_id, Author_list):
    info = soup.find('article', lang='en')
    author_info = info.find('div', id='author-information-content')
    article_info = info.find('div', class_='c-article-header')

    # Authors
    authors = article_info.find('ul', class_='c-article-author-list')
    authors = authors.find_all('li', class_='c-article-author-list__item')
    for author in authors:
        # Name
        author = author.find('a').get_text()
        author = author.split(' ')
        author = [char.replace('-', '') for char in author]

        Firstname = author[0]
        Lastname = author[-1]
        Middlename = ''.join(author[1:-1]) if author[1:-1] else None

        # Year
        Year = info.find('span', attrs={'data-test': 'article-publication-year'}).get_text()

        # Affiliation
        Affiliation = author_info.find('p', class_='c-article-author-affiliation__address').get_text()

        # Email
        Email = None  # Cannot reach the data

        # Put the fields into a dict
        author_data = {
            "author_id": str(uuid.uuid4()),
            "from_article": article_id,
            "firstname": Firstname,
            "lastname": Lastname,
            "middlename": Middlename,
            "affiliation": [
                {
                    "year": Year,
                    "affiliation": Affiliation,
                    "email": Email
                }
            ]
        }

        Author_list.append(author_data)

    return Author_list


def Article_dict(soup, url, article_id):
    info = soup.find('article', lang='en')
    article_info = info.find('div', class_='c-article-header')

    # Title
    Title = article_info.find('h1').get_text()

    # Authors
    Author = []  # A new empty list

    author_list = article_info.find('ul', class_='c-article-author-list')
    authors = author_list.find_all('li', class_='c-article-author-list__item')
    for author in authors:
        author = author.find('a').get_text()
        author = [char.replace('-', '') for char in author]
        author = ''.join(author)
        Author.append(author)

    # Corresponding authors
    Corresponding_author = []  # A new empty list

    corresponding_author_list = info.find('p', id='corresponding-author-list')
    if corresponding_author_list is not None:
        corresponding_authors = corresponding_author_list.find_all('a')
        for corresponding_author in corresponding_authors:
            corresponding_author = corresponding_author.get_text()
            corresponding_author = [char.replace('-', '') for char in corresponding_author]
            corresponding_author = ''.join(corresponding_author)
            Corresponding_author.append(corresponding_author)

    # Submitted_datetime & Published_datetime
    Time = []

    time_list = info.find('ul', class_='c-bibliographic-information__list')
    times = time_list.find_all('time')
    for time in times:
        time = time.get_text()
        Time.append(time)

    Submitted_date = Time[0]
    Publish_date = Time[-1]

    # Keywords
    Keyword = []  # A new empty list

    keyword_list = info.find('ul', class_='c-article-subject-list')
    if keyword_list is not None:
        keywords = keyword_list.find_all('li')
        for keyword in keywords:
            keyword = keyword.get_text()
            Keyword.append(keyword)

    # MSC
    MSC = None  # SpringerOpen.com does not have MSC

    # DOI
    DOI = info.find('li', class_='c-bibliographic-information__list-item c-bibliographic-information__list-item--doi')
    if DOI is not None:
        DOI = DOI.find('span', class_='c-bibliographic-information__value').get_text()

    # Publisher
    Publisher = 'springeropen.com'

    # Journal
    Journal = info.find('p', class_='c-article-info-details')
    Journal = Journal.find('i').get_text()

    # Volume
    Volume = info.find('span', attrs={'data-test': 'article-publication-year'}).get_text()

    # Issue
    Issue = info.find('p', class_='c-article-info-details')
    Issue = Issue.find('span', attrs={'data-test': 'article-number'}).get_text()

    # Page
    Page = None

    # Put the fields into a dict
    article_data = {
        "article_id": article_id,
        "title": Title,
        "authors": Author,
        "corresponding_authors": Corresponding_author,
        "submit_datetime": Submitted_date,
        "publish_datetime": Publish_date,
        "keywords": Keyword,
        "MSC": MSC,
        "URL": url,
        "DOI": DOI,
        "publisher": Publisher,
        "journal": Journal,
        "volume": Volume,
        "issue": Issue,
        "page": Page,
    }

    return article_data
25
SpringerOpen_spider/SD_header.py
Normal file
@@ -0,0 +1,25 @@
import random


# Pool of user-agent strings
uapools = [
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3493.3 Safari/537.36",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
]


def header():
    # Request headers for the site
    headers = {
        'User-Agent': random.choice(uapools),
    }

    return headers
15
SpringerOpen_spider/SD_link.py
Normal file
@@ -0,0 +1,15 @@
import requests
from bs4 import BeautifulSoup


# Standard request wrapper
def Link(url, headers):
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Check whether the request succeeded
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')
        return soup

    except requests.exceptions.RequestException as e:
        print("Request failed:", e)
        return None
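Note (not part of this commit): since Link() returns None when the request fails, callers have to check the result before parsing. A minimal usage sketch combining SD_header and SD_link:

import SD_header
import SD_link

headers = SD_header.header()
soup = SD_link.Link('https://www.springeropen.com/journals', headers)
if soup is not None:
    # Only parse when the request actually succeeded
    print(soup.title.get_text())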
75
SpringerOpen_spider/SD_main.py
Normal file
@@ -0,0 +1,75 @@
import urllib

import SD_header
import SD_link
import SD_threads
import SD_save
from urllib.parse import urljoin

'''
Target site: https://www.springeropen.com

========== Execution order ==========
1. SD_main     Collect the links of all mathematics journals on SpringerOpen -> collect the article-list links inside each journal
2. SD_threads  Manage the thread pool -> call SD_scrawl
3. SD_scrawl   Collect the article detail-page links -> call SD_detail
4. SD_detail   Fetch and parse the article detail pages -> call SD_save -> store into small temporary JSON files
5. SD_main     Call SD_save -> read the local temporary files, filter them, and merge them into large files split by year
*6. SD_save    (optional) Delete all files in the temporary storage area (back them up first)
'''


# Empty lists for page links
Links = []  # A list for links
Webs = []   # A list for journal URLs

# Empty lists for the scraped data
Article_data = []
Author_data = []

# ========== Visit the article-list pages ==========
# Collect the links of the mathematics journals
headers = SD_header.header()
soup = SD_link.Link('https://www.springeropen.com/journals', headers)

hrefs = soup.find('ol', id='Mathematics-list')
hrefs = hrefs.find_all('a')
for href in hrefs:
    href = 'http:' + href.get('href') + '/articles'
    sub_soup = SD_link.Link(href, headers)

    # Get the number of article-list pages of the current journal
    pp = sub_soup.find('p', class_='u-text-sm u-reset-margin').get_text()
    pp = pp.split(' ')[-1]

    # Build the paginated URL
    url = urllib.parse.urljoin(href, 'articles?searchType=journalSearch&sort=PubDate&page=')

    # Store into a dict and append to the list
    web = {
        "url": url,
        "page": int(pp)
    }
    Webs.append(web)

# Expand the links and store them in the list of links to process
for web in Webs:
    for page in range(1, web['page'] + 1):
        link = web['url'] + str(page)
        Links.append(link)

print('\nThe links have been stored!\n')

# Enter the thread pool and start crawling
SD_threads.Threads(Links, Article_data, Author_data)

# Merge the JSON files
SD_save.Transf()

# # ========== Delete all temporary small files (optional; back them up first) ===========
# SD_save.delete('./SpringerOpen_buffer/Article_TS/')
# SD_save.delete('./SpringerOpen_buffer/Author_TS/')
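Note (not part of this commit): the page count above is taken from the last token of the "Page 1 of N" label, which breaks silently if the label text changes. A minimal sketch of a slightly more defensive parse, assuming the same label format; the count_pages name is illustrative.

import re

def count_pages(label_text):
    """Extract the trailing page count from a label such as 'Page 1 of 27'."""
    match = re.search(r'of\s+(\d+)\s*$', label_text.strip())
    return int(match.group(1)) if match else 1

# e.g. count_pages('Page 1 of 27') -> 27, count_pages('unexpected text') -> 1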
85
SpringerOpen_spider/SD_save.py
Normal file
@@ -0,0 +1,85 @@
import os
import json


# Temporarily store the data into JSON files (small files, 50 articles each)
def save_data(dataset, filetype, filename):
    if dataset:
        directory = "./SpringerOpen_buffer/" + filetype + "/"
        os.makedirs(directory, exist_ok=True)
        filepath = os.path.join(directory, filename)
        with open(filepath, "w", encoding='utf-8') as json_file:
            json.dump(dataset, json_file, indent=4)
        print(filetype + " data have been added to", filepath)


# Final filtering and merging of the files
def Transf():
    def Read(folder_path, output_files):
        # Create the output folders
        os.makedirs('./SpringerOpen_buffer/Article_output/', exist_ok=True)
        os.makedirs('./SpringerOpen_buffer/Author_output/', exist_ok=True)

        for filename in os.listdir(folder_path):
            if filename.endswith('.json'):
                file_path = os.path.join(folder_path, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    data = json.load(file)

                # Filter the records by year
                data_oldest = [Dict for Dict in data if (isinstance(Dict, dict) and int(
                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]

                data_2010_2014 = [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]

                data_2015_2020 = [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]

                data_newest = [Dict for Dict in data if (isinstance(Dict, dict) and int(
                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]

                Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]

                # Transfer into the output files
                for index in range(0, 4):
                    with open(output_files[index], 'w', encoding='utf-8') as file:
                        json.dump(Data[index], file, indent=4)

    # Input paths
    author_folder_path = './SpringerOpen_buffer/Author'
    article_folder_path = './SpringerOpen_buffer/Article'

    # Output paths
    author_output_file = [
        './SpringerOpen_buffer/Author_output/Author_output_file(oldest).json',
        './SpringerOpen_buffer/Author_output/Author_output_file(2010-2014).json',
        './SpringerOpen_buffer/Author_output/Author_output_file(2015-2020).json',
        './SpringerOpen_buffer/Author_output/Author_output_file(newest).json'
    ]

    article_output_file = [
        './SpringerOpen_buffer/Article_output/Article_output_file(oldest).json',
        './SpringerOpen_buffer/Article_output/Article_output_file(2010-2014).json',
        './SpringerOpen_buffer/Article_output/Article_output_file(2015-2020).json',
        './SpringerOpen_buffer/Article_output/Article_output_file(newest).json'
    ]

    # Read the files and write the output
    Read(author_folder_path, author_output_file)
    Read(article_folder_path, article_output_file)

    # End
    print("\nData has been written into files.")


# Delete the files in the temporary storage area
def delete(folder_path):
    file_names = os.listdir(folder_path)

    for file_name in file_names:
        file_path = os.path.join(folder_path, file_name)
        if os.path.isfile(file_path):
            os.remove(file_path)

    print('\nAttention: The temporary storage files have been deleted!')
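Note (not part of this commit): Read() above reopens each output file with 'w' for every buffer file it reads, so later buffer files overwrite earlier ones. A minimal sketch of collecting all records first and writing each year bucket exactly once, assuming the same 'volume' / 'affiliation' year fields; the merge_by_year name is illustrative.

import json
import os

def merge_by_year(folder_path, output_files):
    """Collect every record from folder_path, bucket by year, write each bucket once."""
    buckets = [[], [], [], []]  # oldest, 2010-2014, 2015-2020, newest

    def year_of(record):
        return int(record.get('volume') or record.get('affiliation', [{}])[0].get('year', 0))

    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                for record in json.load(file):
                    if not isinstance(record, dict):
                        continue
                    year = year_of(record)
                    if year <= 2009:
                        buckets[0].append(record)
                    elif year <= 2014:
                        buckets[1].append(record)
                    elif year <= 2020:
                        buckets[2].append(record)
                    else:
                        buckets[3].append(record)

    for bucket, output_file in zip(buckets, output_files):
        with open(output_file, 'w', encoding='utf-8') as file:
            json.dump(bucket, file, indent=4)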
48
SpringerOpen_spider/SD_scrawl.py
Normal file
@@ -0,0 +1,48 @@
import time
import urllib
import uuid
from urllib.parse import urljoin

import SD_header
import SD_link
import SD_detail
import SD_save


# ========== Collect the article detail-page links ==========
def Scrawl(Link, Article_data, Author_data):
    # Visit the article-list page
    headers = SD_header.header()
    soup = SD_link.Link(Link, headers)
    print(Link)

    # Collect the links of all article detail pages
    Essay_Ol = soup.find('ol')          # The article list
    Essay_Li = Essay_Ol.find_all('li')  # All article detail-page entries

    # Crawl every article in one list page (50 articles)
    for Essay_hrefs in Essay_Li:
        Essay_href = Essay_hrefs.find('a', itemprop='url')
        if Essay_href is not None:
            time.sleep(0.1)

            sub_Link = Essay_href.get('href')  # The relative link
            sub_Link = urllib.parse.urljoin('https://advancesincontinuousanddiscretemodels.springeropen.com/', sub_Link)

            # ========== Visit the article detail page ==========
            sub_soup = SD_link.Link(sub_Link, headers)  # Fetch the details
            article_id = str(uuid.uuid4())              # Article id

            # Extract the details and append them to the corresponding lists
            Article_data.append(SD_detail.Article_dict(sub_soup, sub_Link, article_id))
            Author_data = SD_detail.Author_dict(sub_soup, article_id, Author_data)

    # Temporarily store into small JSON files
    if Article_data:
        index = str(uuid.uuid4())
        SD_save.save_data(Article_data, "Article_TS", index + ".json")
        print('Finished: ' + index + ' has been added to ./SpringerOpen_buffer/Article_TS/')

    if Author_data:
        index = str(uuid.uuid4())
        SD_save.save_data(Author_data, "Author_TS", index + ".json")
        print('Finished: ' + index + ' has been added to ./SpringerOpen_buffer/Author_TS/')
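Note (not part of this commit): Scrawl() writes the entire shared Article_data / Author_data lists to a new buffer file after every list page, so records from earlier pages can land in several files. A minimal sketch of the same save step using per-page lists instead; the save_page_results name is illustrative.

import uuid
import SD_save

def save_page_results(page_articles, page_authors):
    """Write only the records gathered for a single article-list page."""
    if page_articles:
        SD_save.save_data(page_articles, "Article_TS", str(uuid.uuid4()) + ".json")
    if page_authors:
        SD_save.save_data(page_authors, "Author_TS", str(uuid.uuid4()) + ".json")

Scrawl() would then build page_articles / page_authors locally for each list page and extend the shared lists only once, after calling save_page_results.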
25
SpringerOpen_spider/SD_threads.py
Normal file
@@ -0,0 +1,25 @@
from concurrent.futures import ThreadPoolExecutor, as_completed, wait
import SD_scrawl


# ========== Multithreaded processing ==========
def Threads(Links, Article_data, Author_data):
    executor = ThreadPoolExecutor(max_workers=20)  # Thread pool

    # Submit the crawling tasks
    futures = [executor.submit(SD_scrawl.Scrawl, Link, Article_data, Author_data) for Link in Links]

    # max_iterations = 5   # Maximum number of iterations
    # iteration_count = 0  # Counter

    # Wait for all tasks to complete
    for future in as_completed(futures):
        try:
            future.result()

            # # Limit the maximum number of simultaneous crawls
            # iteration_count += 1  # Increment the counter
            # if iteration_count >= max_iterations:
            #     break
        except Exception as e:
            print("An error occurred:", str(e))

    wait(futures)