import re
import datetime
import threading
import ejqtde_scrawler
import ejqtde_save
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.edge.options import Options
from concurrent.futures import ThreadPoolExecutor, as_completed, wait
from urllib.parse import urljoin
'''
Target site: 'https://www.math.u-szeged.hu/ejqtde'
========== Run order ==========
1. ejqtde_main      collects the journal issue links for every year
2. ejqtde_scrawler  scrapes each paper's metadata and author information
                    -> calls ejqtde_save -> buffers the records as small JSON files
3. ejqtde_save      reads the buffered small files locally, filters them, and merges
                    them into one large file per year
*4. ejqtde_save.delete() (optional) deletes every file in the buffer (back it up first)
'''
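# URL shapes this script works with (taken from the code below):
#   year listing page:   periodica.html?periodica=1&paramtipus_ertek=publications&param_ertek=<year>
#   single publication:  periodica.html?periodica=1&paramtipus_ertek=publication&param_ertek=<id>
# extract_href() walks each year listing page and collects the single-publication
# links into the shared `hrefs` list; ejqtde_scrawler.scrawler() then visits each one.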
# Worker for the thread pool below: collects publication links from one year page
def extract_href(link):
    driver = webdriver.Edge(options=options)
    driver.get(link)
    html_code = driver.page_source
    soup = BeautifulSoup(html_code, 'html.parser')
    column_right = soup.find('div', id='columnRight')
    if column_right:
        ordered_lists = column_right.find_all('ol')
        for ordered_list in ordered_lists:
            for list_item in ordered_list.find_all('li'):
                # Each <li> carries a link to a single publication page
                matches = re.findall(r'</a>: <a\s+href="(periodica\.html\?periodica=1&amp;'
                                     r'paramtipus_ertek=publication&amp;param_ertek=\d+)"', str(list_item))
                for match in matches:
                    URL = urljoin('https://www.math.u-szeged.hu/ejqtde/', match)
                    # list.append is atomic under the GIL, so no extra lock is needed here
                    hrefs.append(URL)
    print('Links got:', link)
    driver.quit()
# Shared lists filled by the worker threads
Author_list = []
Article_list = []
hrefs = []
# Base web urls
current_year = datetime.datetime.now().year
years = range(2009, current_year + 1) # years = range(2010, current_year + 1)
baseWeb = 'https://www.math.u-szeged.hu/ejqtde/'
url_list = [baseWeb + f'periodica.html?periodica=1&paramtipus_ertek=publications&param_ertek={year}'
            for year in years][::-1]  # newest year first
# Options setting for the Edge driver
options = Options()
options.add_argument('--headless')      # Run Edge in headless mode
options.add_argument('--disable-gpu')   # Disable GPU acceleration
options.page_load_strategy = 'none'     # Page load strategy is a driver capability, not a CLI flag
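# Optional smoke test (illustrative sketch, commented out and not part of the normal run):
# call the worker once on a single year page to confirm the page structure and regex
# still match before launching the full pool. The year 2020 is only an example value.
# extract_href(baseWeb + 'periodica.html?periodica=1&paramtipus_ertek=publications&param_ertek=2020')
# print(len(hrefs), 'publication links collected from the test page')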
# Use multithreading to process the year listing pages concurrently
with ThreadPoolExecutor(max_workers=25) as executor:
    futures = [executor.submit(extract_href, url) for url in url_list]
    wait(futures)
print('\nAll links have been collected.\n')
# Use multithreading to scrape every publication page
count1 = 0
count2 = 0
scrawl_lock = threading.Lock()  # guards the shared lists inside ejqtde_scrawler.scrawler
with ThreadPoolExecutor(max_workers=25) as executor:
    futures = [executor.submit(ejqtde_scrawler.scrawler, href, scrawl_lock, Article_list, Author_list)
               for href in hrefs]
    for future in as_completed(futures):
        # Flush the buffers to disk every 50 records; take the same lock the workers
        # use so no record can be appended between save_data() and clear()
        if len(Article_list) >= 50:
            with scrawl_lock:
                count1 += len(Article_list)
                ejqtde_save.save_data(Article_list, "Article_TS")
                Article_list.clear()
        if len(Author_list) >= 50:
            with scrawl_lock:
                count2 += len(Author_list)
                ejqtde_save.save_data(Author_list, "Author_TS")
                Author_list.clear()
    wait(futures)
# Deal with the remaining data (all workers have finished, so no lock is needed)
if len(Article_list) > 0:
    count1 += len(Article_list)
    ejqtde_save.save_data(Article_list, "Article_TS")
    Article_list.clear()
print('Finished: All article_data has been added to ./EJQTDE_buffer/Article_TS/')
if len(Author_list) > 0:
    count2 += len(Author_list)
    ejqtde_save.save_data(Author_list, "Author_TS")
    Author_list.clear()
print('Finished: All author_data has been added to ./EJQTDE_buffer/Author_TS/')
print('\nThe whole scrawler program has finished.\n')
print(count1, 'article records have been stored.')
print(count2, 'author records have been stored.')
# Merge the buffer into the large per-year files, then delete the temporary storage files
ejqtde_save.Transf()
ejqtde_save.delete()
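# Expected on-disk layout after a full run (inferred from the messages above and the
# module docstring; the exact file names are decided inside ejqtde_save):
#   ./EJQTDE_buffer/Article_TS/   small JSON chunks of article records
#   ./EJQTDE_buffer/Author_TS/    small JSON chunks of author records
# ejqtde_save.Transf() merges these chunks into the per-year files, and
# ejqtde_save.delete() then empties the buffer, so keep a backup if the merged
# output still needs to be cross-checked.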