Merge branch 'main' of https://git.ecwuuuuu.com/datamining/CST_scrawlCode
commit ee0f956645
@@ -1,38 +0,0 @@ (deleted file)
import os
import json


# Function
# Get the data from input files
def Read(folder_path):
    data = []

    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                data.extend(json.load(file))
    return data


# Write into output files
def Write(data, output_file):
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)


# Path of files need to be read
folder_path1 = '.\ejde_buffer\Author'
folder_path2 = '.\ejde_buffer\Article'

# Read the data in the files
Author_data = Read(folder_path1)
Article_data = Read(folder_path2)

# The path of output files
output_file1 = '.\ejde_buffer\Author_output_file.json'
output_file2 = '.\ejde_buffer\Article_output_file.json'

# Write into files
Write(Author_data, output_file1)
Write(Article_data, output_file2)

# End
print("\nData has been written into files.")
(modified file)
@@ -1,23 +1,22 @@
-import os
 import uuid
 import requests
-from bs4 import BeautifulSoup
 import re
-import json
+import ejde_save

 from concurrent.futures import ThreadPoolExecutor, as_completed
 from retrying import retry
+from bs4 import BeautifulSoup


-def save_data(dataset, filetype, filename):
-    if dataset:
-        directory = "./ejde_buffer/" + filetype + "/"
-        os.makedirs(directory, exist_ok=True)
-        filepath = os.path.join(directory, filename)
-        with open(filepath, "w", encoding='utf-8') as json_file:
-            json.dump(dataset, json_file, indent=4)
-        print(filetype + " data have been added to", filepath)
+'''
+Target site: 'ejde.math.txstate.edu'
+
+========== Run order ==========
+1. ejde_main: collect each year's journal links -> scrape every article's details and author details -> call ejde_save -> buffer the results in small JSON files
+2. ejde_save: scan the buffered small files locally, filter them, and write them into large per-period files
+*3. ejde_save.delete() (optional): delete every file in the temporary buffer (back it up first)
+'''

+# Article and author detail
 @retry(wait_fixed=5000, stop_max_attempt_number=5)
 def process_article(url):
     response = requests.get(url)
@@ -43,7 +42,7 @@ def process_article(url):

     # Extract volume
     volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
-    volume = volume_match.group(1) if volume_match else None
+    volume = str(volume_match.group(1)) if volume_match else None
     # year = volume_match.group(2) if volume_match else None

     # Extract pp
@@ -141,11 +140,11 @@ def process_article(url):

     # Save the data periodically based on batch size
     if len(articleData) % batch_size == 0:
-        save_data(articleData, "Article", str(uuid.uuid4()) + ".json")
+        ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
         articleData.clear()

     if len(authorData) % batch_size == 0:
-        save_data(authorData, "Author", str(uuid.uuid4()) + ".json")
+        ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
         authorData.clear()

@@ -162,7 +161,7 @@ url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_l
 authorData = []
 articleData = []

-batch_size = 500  # Number of articles to process before saving
+batch_size = 5  # Number of articles to process before saving
 executor = ThreadPoolExecutor(max_workers=20)  # Set the number of worker threads

 # Process each URL using multithreading
@@ -176,10 +175,14 @@ for future in as_completed(futures):
         print("An error occurred:", str(e))

 # Save remaining data
-if articleData:
-    save_data(articleData, "Article", str(uuid.uuid4()) + ".json")
-    print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article/")
+if len(articleData) > 0:
+    ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
+    print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")

-if authorData:
-    save_data(authorData, "Author", str(uuid.uuid4()) + ".json")
-    print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author/")
+if len(authorData) > 0:
+    ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
+    print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
+
+# Transfer to large file and delete the temporary storage files
+ejde_save.Transf()
+ejde_save.delete()
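A note on the retry behaviour this file keeps relying on: with the retrying package, @retry(wait_fixed=5000, stop_max_attempt_number=5) re-runs the decorated function whenever it raises, waiting five seconds between attempts and giving up after the fifth try. A minimal sketch of that pattern with a hypothetical fetch() helper; the decorated process_article() above behaves the same way:

import requests
from retrying import retry

@retry(wait_fixed=5000, stop_max_attempt_number=5)
def fetch(url):
    # Any exception raised here (timeout, bad HTTP status) triggers a retry
    # after a fixed 5000 ms wait, up to 5 attempts in total.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.text

# html = fetch('https://ejde.math.txstate.edu/')  # hypothetical call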
EJDE_spider/ejde_save.py (new file, 93 lines)
@@ -0,0 +1,93 @@
import os
import json


# Save data
def save_data(dataset, filetype, filename):
    if dataset:
        directory = "./ejde_buffer/" + filetype + "/"
        os.makedirs(directory, exist_ok=True)
        filepath = os.path.join(directory, filename)
        with open(filepath, "w", encoding='utf-8') as json_file:
            json.dump(dataset, json_file, indent=4)
        print(filetype + " data have been added to", filepath)


# Write into output files
def Transf():
    def Read(folder_path, output_files):
        # Create new folders
        os.makedirs('./ejde_buffer/Article_output/', exist_ok=True)
        os.makedirs('./ejde_buffer/Author_output/', exist_ok=True)

        data_oldest = []
        data_2010_2014 = []
        data_2015_2020 = []
        data_newest = []

        for filename in os.listdir(folder_path):
            if filename.endswith('.json'):
                file_path = os.path.join(folder_path, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    data = json.load(file)

                for Dict in data:
                    if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
                        # Select data
                        data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]

                        data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]

                        data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]

                        data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]

        Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]

        # Transfer
        for index in range(0, 4):
            with open(output_files[index], 'w', encoding='utf-8') as file:
                json.dump(Data[index], file, indent=4)

    # The path of reading
    author_folder_path = './ejde_buffer/Author_TS'
    article_folder_path = './ejde_buffer/Article_TS'

    # The path of storage
    author_output_file = [
        './ejde_buffer/Author_output/Author_output_file(oldest).json',
        './ejde_buffer/Author_output/Author_output_file(2010-2014).json',
        './ejde_buffer/Author_output/Author_output_file(2015-2020).json',
        './ejde_buffer/Author_output/Author_output_file(newest).json'
    ]

    article_output_file = [
        './ejde_buffer/Article_output/Article_output_file(oldest).json',
        './ejde_buffer/Article_output/Article_output_file(2010-2014).json',
        './ejde_buffer/Article_output/Article_output_file(2015-2020).json',
        './ejde_buffer/Article_output/Article_output_file(newest).json'
    ]

    # Read and write into files
    Read(author_folder_path, author_output_file)
    Read(article_folder_path, article_output_file)

    # End
    print("\nData has been written into files.")


# Delete files in temporary storage area
def delete():
    folder_paths = ['./ejde_buffer/Author_TS', './ejde_buffer/Article_TS']
    for folder_path in folder_paths:
        file_names = os.listdir(folder_path)
        for file_name in file_names:
            file_path = os.path.join(folder_path, file_name)
            if os.path.isfile(file_path):
                os.remove(file_path)

    print('\nAttention: The temporary storage files have been deleted!')
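The period split inside Transf() keys off either a record's 'volume' or, failing that, the first affiliation's 'year', and treats the chosen number as a four-digit year. A simplified sketch of that bucketing rule, assuming the same field layout as the buffered JSON; the sample records are made up:

def bucket(record):
    # Same selector the list comprehensions use: prefer 'volume', fall back
    # to the first affiliation's 'year'.
    year = int(record.get('volume') or record.get('affiliation', [{}])[0].get('year', 0))
    if year <= 2009:
        return 'oldest'
    if year <= 2014:
        return '2010-2014'
    if year <= 2020:
        return '2015-2020'
    return 'newest'

print(bucket({'volume': '2013'}))                 # 2010-2014
print(bucket({'affiliation': [{'year': 2022}]}))  # newest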
EJQTDE_spider/ejqtde_main.py (new file, 113 lines)
@@ -0,0 +1,113 @@
import re
import datetime
import threading
import urllib
import ejqtde_scrawler
import ejqtde_save

from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.edge.options import Options
from concurrent.futures import ThreadPoolExecutor, as_completed, wait
from urllib.parse import urljoin

'''
Target site: 'https://www.math.u-szeged.hu/ejqtde'

========== Run order ==========
1. ejqtde_main: collect each year's journal links
2. ejqtde_scrawler: scrape every article's details and author details -> call ejqtde_save -> buffer the results in small JSON files
3. ejqtde_save: scan the buffered small files locally, filter them, and write them into large per-period files
*4. ejqtde_save.delete() (optional): delete every file in the temporary buffer (back it up first)
'''


# Multithread pool
def extract_href(link):
    driver = webdriver.Edge(options=options)
    driver.get(link)
    html_code = driver.page_source
    soup = BeautifulSoup(html_code, 'html.parser')
    column_right = soup.find('div', id='columnRight')
    if column_right:
        ordered_lists = column_right.find_all('ol')
        for idx, ordered_list in enumerate(ordered_lists, 1):
            for list_item in ordered_list.find_all('li'):
                matches = re.findall(r'</a>: <a\s+href="(periodica\.html\?periodica=1&'
                                     r'paramtipus_ertek=publication&param_ertek=\d+)"', str(list_item))
                for match in matches:
                    URL = urllib.parse.urljoin('https://www.math.u-szeged.hu/ejqtde/', match)
                    hrefs.append(URL)

    print('Links got: ', link)
    driver.quit()


# Empty list
Author_list = []
Article_list = []
hrefs = []

# Base web urls
baseWeb = 'https://www.math.u-szeged.hu/ejqtde/'
current_year = datetime.datetime.now().year
years = range(2009, 2011)  # years = range(2010, current_year + 1)
url_list = ['https://www.math.u-szeged.hu/ejqtde/periodica.html?periodica=1&paramtipus_ertek=publications&param_ertek='
            + f'{year}' for year in years][::-1]

# Options setting
options = Options()
options.add_argument('--headless')  # Run Edge in headless mode
options.add_argument('disable-gpu')  # Disable GPU acceleration
options.add_argument('pageLoadStrategy=none')  # Set page load strategy to 'none'

# Use multithreading to process URLs concurrently
with ThreadPoolExecutor(max_workers=25) as executor:
    futures = [executor.submit(extract_href, url) for url in url_list]
    for future in as_completed(futures):
        pass

wait(futures)
print('\nAll links have been got.\n')

# Use multithreading to get the data
count1 = 0
count2 = 0
locks = threading.Lock()
scrawl_lock = threading.Lock()

with ThreadPoolExecutor(max_workers=25) as executor:
    futures = [executor.submit(ejqtde_scrawler.scrawler, href, scrawl_lock, Article_list, Author_list) for href in hrefs]
    for future in as_completed(futures):
        if len(Article_list) >= 50:
            with locks:
                count1 += len(Article_list)
                ejqtde_save.save_data(Article_list, "Article_TS")
                Article_list.clear()

        if len(Author_list) >= 50:
            with locks:
                count2 += len(Author_list)
                ejqtde_save.save_data(Author_list, "Author_TS")
                Author_list.clear()

wait(futures)

# Deal with the remaining data
if len(Article_list) > 0:
    count1 += len(Article_list)
    ejqtde_save.save_data(Article_list, "Article_TS")
    Article_list.clear()
    print('Finished: All article_data has been added to ./EJQTDE_buffer/Article_TS/')
if len(Author_list) > 0:
    count2 += len(Author_list)
    ejqtde_save.save_data(Author_list, "Author_TS")
    Author_list.clear()
    print('Finished: All author_data has been added to ./EJQTDE_buffer/Author_TS/')

print('\nThe whole scrawler program has been done\n')
print(count1, ' article_data has been stored.')
print(count2, ' author_data has been stored.')

# Transfer to large file and delete the temporary storage files
ejqtde_save.Transf()
ejqtde_save.delete()
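The driver above shares Article_list and Author_list between the scraper threads and the main as_completed() loop, taking a lock before flushing a batch of 50 records to the buffer. A stripped-down, hypothetical sketch of that produce-and-flush pattern, where worker() and flush() stand in for ejqtde_scrawler.scrawler and ejqtde_save.save_data:

import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

results = []
lock = threading.Lock()
BATCH = 50

def worker(i):
    # Stand-in for the scraper: append one record under the lock.
    with lock:
        results.append({'id': i})

def flush(batch):
    # Stand-in for save_data: report the batch instead of writing JSON.
    print('flushed', len(batch), 'records')

with ThreadPoolExecutor(max_workers=8) as pool:
    futures = [pool.submit(worker, i) for i in range(120)]
    for _ in as_completed(futures):
        with lock:
            if len(results) >= BATCH:
                flush(results)
                results.clear()

# Flush whatever is left once the pool has drained.
if results:
    flush(results)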
EJQTDE_spider/ejqtde_save.py (new file, 96 lines)
@@ -0,0 +1,96 @@
import os
import json
import uuid


# Save into files
def save_data(dataset, filetype):
    if dataset:
        filename = str(uuid.uuid4()) + ".json"
        directory = "./EJQTDE_buffer/" + filetype + "/"
        os.makedirs(directory, exist_ok=True)
        filepath = os.path.join(directory, filename)
        with open(filepath, "w", encoding='utf-8') as json_file:
            json.dump(dataset, json_file, indent=4)
        print(filetype + " data have been added to", filepath)


# Summary files
def Transf():
    def Read(folder_path, output_files):
        # Create new folder
        os.makedirs('./EJQTDE_buffer/Article_output/', exist_ok=True)
        os.makedirs('./EJQTDE_buffer/Author_output/', exist_ok=True)

        data_oldest = []
        data_2010_2014 = []
        data_2015_2020 = []
        data_newest = []

        for filename in os.listdir(folder_path):
            if filename.endswith('.json'):
                file_path = os.path.join(folder_path, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    data = json.load(file)

                for Dict in data:
                    if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
                        # Select data
                        data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]

                        data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]

                        data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]

                        data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]

        Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]

        # Transfer
        for index in range(0, 4):
            with open(output_files[index], 'w', encoding='utf-8') as file:
                json.dump(Data[index], file, indent=4)

    # The path of reading
    author_folder_path = './EJQTDE_buffer/Author_TS'
    article_folder_path = './EJQTDE_buffer/Article_TS'

    # The path of storage
    author_output_file = [
        './EJQTDE_buffer/Author_output/Author_output_file(oldest).json',
        './EJQTDE_buffer/Author_output/Author_output_file(2010-2014).json',
        './EJQTDE_buffer/Author_output/Author_output_file(2015-2020).json',
        './EJQTDE_buffer/Author_output/Author_output_file(newest).json'
    ]

    article_output_file = [
        './EJQTDE_buffer/Article_output/Article_output_file(oldest).json',
        './EJQTDE_buffer/Article_output/Article_output_file(2010-2014).json',
        './EJQTDE_buffer/Article_output/Article_output_file(2015-2020).json',
        './EJQTDE_buffer/Article_output/Article_output_file(newest).json'
    ]

    # Read and write into files
    Read(author_folder_path, author_output_file)
    Read(article_folder_path, article_output_file)

    # End
    print("\nData has been written into files.")


# Delete files in temporary storage area
def delete():
    folder_paths = ['./EJQTDE_buffer/Author_TS', './EJQTDE_buffer/Article_TS']
    for folder_path in folder_paths:
        file_names = os.listdir(folder_path)
        for file_name in file_names:
            file_path = os.path.join(folder_path, file_name)
            if os.path.isfile(file_path):
                os.remove(file_path)

    print('\nAttention: The temporary storage files have been deleted!')
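Taken together with the run-order docstring in ejqtde_main.py, a typical call sequence for this module looks as follows; the sample records are invented, and the sketch assumes ejqtde_save.py above is on the import path:

import ejqtde_save

# Buffer one small batch per list into ./EJQTDE_buffer/<filetype>/<uuid>.json
ejqtde_save.save_data([{'article_id': 'demo', 'volume': '2015'}], 'Article_TS')
ejqtde_save.save_data([{'from_article': 'demo', 'affiliation': [{'year': 2015}]}], 'Author_TS')

# Merge everything buffered so far into the four per-period output files
ejqtde_save.Transf()

# Optionally clear the temporary buffer afterwards (back it up first)
# ejqtde_save.delete()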
EJQTDE_spider/ejqtde_scrawler.py (new file, 187 lines)
@@ -0,0 +1,187 @@
import time
import uuid
import re
import urllib

from selenium.webdriver.edge.options import Options
from selenium import webdriver
from bs4 import BeautifulSoup
from urllib.parse import urljoin


# Get the information in the webpage through selenium
def source(driver, num):
    if driver.find_elements(by='id', value='columnRight'):
        html_code = driver.page_source
        soup = BeautifulSoup(html_code, 'html.parser')
        return soup
    elif num == 5:
        print('Out of times!')
        driver.quit()
        return None
    else:
        num += 1
        time.sleep(3)
        return source(driver, num)


# Get the links of the authors' information
def author_links(Data):
    Author_links = []
    Author_hrefs_pattern = re.compile(r'periodica\.html\?periodica=1&'
                                      'paramtipus_ertek=person_data&param_ertek=\d+')
    Author_hrefs = re.findall(Author_hrefs_pattern, str(Data))
    for Author_href in Author_hrefs:
        Author_href = urllib.parse.urljoin('https://www.math.u-szeged.hu/ejqtde/', Author_href)
        Author_links.append(Author_href)

    return Author_links


# Get the information of the authors
def author_detail(Data, Year, article_id, Author_list):
    # Name
    author = Data.find('p', class_='publication_head').get_text()

    author = author.split(',')
    author = [char.replace(' ', '') for char in author]

    Firstname = author[0]
    Lastname = author[-1]
    Middlename = ''.join(author[1:-1]) if author[1:-1] else None

    # infor
    table = Data.find('table', attrs={'border': '1', 'cellpadding': '2px'})
    Td = table.find_all('td')
    line = [td for td in Td]

    # Affiliation
    Affiliation = line[1].get_text()

    # Email
    Email = line[0].find('a').get('href')

    author_data = {
        "author_id": str(uuid.uuid4()),
        "from_article": article_id,
        "firstname": Firstname,
        "lastname": Lastname,
        "middlename": Middlename,
        "affiliation": [
            {
                "year": Year,
                "affiliation": Affiliation,
                "email": Email
            }
        ]
    }

    Author_list.append(author_data)
    return Author_list


# Get the article's information
def article_detail(Data, URL, article_id, Aricle_list):
    # Title
    font = Data.find('font', attrs={'size': '+1'})
    Title = font.find('b').get_text()

    # Author and Corresponding_authors
    author_pattern = re.compile(r'periodica\.html\?periodica=1&'
                                r'paramtipus_ertek=person_data&param_ertek=\d+"><b>(.*?)</b>')
    Author = re.findall(author_pattern, str(Data))
    Corresponding_author = Author[-1]  # Corresponding_authors
    del Author[-1]

    # Submit_datetime and publish_datetime
    time = Data.find('td', attrs={'align': 'right', 'width': '50%'})
    time = re.findall(r'\d+-\d+-\d+', str(time))
    Submit_date = time[0] if time[0] else None
    Publish_date = time[1] if time[1] else None

    # Keyword
    Keyword = Data.find('keywords').get_text() if Data.find('keywords') is not None else None
    Keyword = Keyword.split(', ') if Keyword is not None else None

    # MSC
    MSC = Data.find('subjectcodes').get_text() if Data.find('subjectcodes') is not None else None
    MSC = MSC.split(', ') if MSC is not None else None

    # DOI
    if len(re.findall(r'<a href="(https://doi\.org/.*?)"', str(Data))) > 0:
        DOI = re.findall(r'<a href="(https://doi\.org/.*?)"', str(Data))[0]
    else:
        DOI = None

    # Publisher
    Publisher = 'www.math.u-szeged.hu/ejqtde'

    # Journal
    Journal = 'Electronic Journal of Qualitative Theory of Differential Equations'

    # Volume
    Volume = re.findall(r'<b>(\d+)</b>', str(Data))[0]

    # Issue and page
    result = Data.select_one('body > div:nth-of-type(3) > div:nth-of-type(2)').get_text()
    Issue = re.findall(r'(\d+), \d+-\d+', result)[0]
    Page = re.findall(r'\d+, (\d+-\d+)', result)[0]

    article_data = {
        "article_id": article_id,
        "title": Title,
        "authors": Author,
        "corresponding_authors": Corresponding_author,
        "submit_datetime": Submit_date,
        "publish_datetime": Publish_date,
        "keywords": Keyword,
        "MSC": MSC,
        "URL": URL,
        "DOI": DOI,
        "publisher": Publisher,
        "journal": Journal,
        "volume": Volume,
        "issue": Issue,
        "page": Page,
    }

    Aricle_list.append(article_data)
    return Aricle_list


# Main code of scrawler
def scrawler(URL, lock, Article_list, Author_list):
    print('Start: ', URL)
    driver = webdriver.Edge(options=options)
    driver.get(URL)

    # Enter the detail page
    Max_retryTimes = 3
    Essay_data = source(driver, Max_retryTimes)
    if Essay_data is not None:
        article_id = str(uuid.uuid4())
        Article_list = article_detail(Essay_data, URL, article_id, Article_list)

        # Get the authors' information
        Year = re.findall(r'<b>(\d+)</b>', str(Essay_data))[0]
        for author_link in author_links(Essay_data):
            driver.get(author_link)
            Author_detail = source(driver, Max_retryTimes)
            Author_list = author_detail(Author_detail, Year, article_id, Author_list)

        print('Complete: ', URL)
        driver.quit()

    else:
        print('Wrong: Some error occurred: ', URL)
        pass


# Options setting
options = Options()
options.add_argument('--headless')  # Run Edge in headless mode
options.add_argument('disable-gpu')  # Disable GPU acceleration
options.add_argument('pageLoadStrategy=none')  # Set page load strategy to 'none'
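For clarity on the name handling in author_detail() above: the publication_head text is split on commas, spaces are stripped from every chunk, the first chunk becomes the firstname, the last chunk the lastname, and anything in between is joined into the middlename (or None). A small illustration with made-up headings:

def split_name(head):
    # Mirrors the splitting done in author_detail(); 'head' is a made-up example.
    parts = [chunk.replace(' ', '') for chunk in head.split(',')]
    firstname = parts[0]
    lastname = parts[-1]
    middlename = ''.join(parts[1:-1]) if parts[1:-1] else None
    return firstname, middlename, lastname

print(split_name('John, Michael, Doe'))  # ('John', 'Michael', 'Doe')
print(split_name('Jane, Doe'))           # ('Jane', None, 'Doe')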
(modified file)
@@ -88,7 +88,7 @@ def Article_dict(soup, url, article_id):
         time = time.get_text()
         Time.append(time)

-    Submitted_date = Time[0]
+    Submit_date = Time[0]
     Publish_date = Time[-1]

     # keyword
@@ -132,7 +132,7 @@ def Article_dict(soup, url, article_id):
         "title": Title,
         "authors": Author,
         "corresponding_authors": Corresponding_author,
-        "submit_datetime": Submitted_date,
+        "submit_datetime": Submit_date,
         "publish_datetime": Publish_date,
         "keywords": Keyword,
         "MSC": MSC,
(deleted file)
@@ -1,25 +0,0 @@
import random


# Pool of user-agent strings
uapools = [
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3493.3 Safari/537.36",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
]


def header():
    # Request headers for the site
    headers = {
        'User-Agent': random.choice(uapools),
    }

    return headers
(modified file)
@@ -1,6 +1,28 @@
+import random
 import requests
 from bs4 import BeautifulSoup

+# Pool of user-agent strings
+uapools = [
+    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3493.3 Safari/537.36",
+    "Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
+    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
+    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
+    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
+]
+
+
+def header():
+    # Request headers for the site
+    headers = {
+        'User-Agent': random.choice(uapools),
+    }
+
+    return headers
+
 # Standard access wrapper
 def Link(url, headers):
     try:
(modified file)
@@ -1,6 +1,5 @@
 import urllib

-import SD_header
 import SD_link
 import SD_threads
 import SD_save
@@ -20,8 +19,8 @@ from urllib.parse import urljoin


 # Empty lists for the page links
-Links = []  # A list for links
 Webs = []  # A list for web url
+Links = []  # A list for links

 # Empty lists for the scraped data
 Article_data = []
@@ -29,7 +28,7 @@ Author_data = []

 # ========== Visit the article list pages ==========
 # Get the links of the mathematics journal sites
-headers = SD_header.header()
+headers = SD_link.header()
 soup = SD_link.Link('https://www.springeropen.com/journals', headers)

 hrefs = soup.find('ol', id='Mathematics-list')
(modified file)
@@ -20,23 +20,30 @@ def Transf():
         os.makedirs('./SpringerOpen_buffer/Article_output/', exist_ok=True)
         os.makedirs('./SpringerOpen_buffer/Author_output/', exist_ok=True)

+        data_oldest = []
+        data_2010_2014 = []
+        data_2015_2020 = []
+        data_newest = []
+
         for filename in os.listdir(folder_path):
             if filename.endswith('.json'):
                 file_path = os.path.join(folder_path, filename)
                 with open(file_path, 'r', encoding='utf-8') as file:
                     data = json.load(file)

+                for Dict in data:
+                    if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
                         # Filter the articles
-                data_oldest = [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
+                        data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]

-                data_2010_2014 = [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
+                        data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]

-                data_2015_2020 = [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
+                        data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]

-                data_newest = [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
+                        data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]

         Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
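The substantive change in this hunk is '=' becoming '+=' on the four bucket lists, along with the lists now being initialised before the file loop: with '=', every JSON file read in the loop overwrote the buckets, so only the records from the last buffered file survived; with '+=', the matches from every file accumulate. A two-chunk illustration of the difference:

buckets = []
for chunk in ([1, 2], [3]):
    buckets += [x for x in chunk if x > 0]   # accumulates across chunks
    # buckets = [x for x in chunk if x > 0]  # would keep only the last chunk
print(buckets)  # [1, 2, 3]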
(modified file)
@@ -3,7 +3,6 @@ import urllib
 import uuid
 from urllib.parse import urljoin

-import SD_header
 import SD_link
 import SD_detail
 import SD_save
@@ -11,7 +10,7 @@ import SD_save
 # ========== Get the links of the article detail pages ==========
 def Scrawl(Link, Article_data, Author_data):
     # Visit the article list page
-    headers = SD_header.header()
+    headers = SD_link.header()
     soup = SD_link.Link(Link, headers)
     print(Link)
