import re
import datetime
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed, wait
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

import ejqtde_scrawler
import ejqtde_save

'''
Target site: 'https://www.math.u-szeged.hu/ejqtde'

========== Run order ==========
1. ejqtde_main      collects the issue links for each year
2. ejqtde_scrawler  scrapes each paper's metadata and author info -> calls
                    ejqtde_save -> buffers the records in small JSON files
3. ejqtde_save      filters the buffered files locally and merges them into
                    one large file per year
*4. ejqtde_save.delete() (optional)  deletes every file in the buffer
                    (back up first!)
'''

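# The two helper modules are not defined in this file. Judging only from how
# they are called below, their interfaces are roughly the following (a sketch
# of assumptions, not their actual implementation):
#
#   ejqtde_scrawler.scrawler(href, lock, article_list, author_list)
#       scrapes one paper page and, holding `lock`, appends record dicts to
#       the two shared lists
#   ejqtde_save.save_data(records, subdir)
#       dumps `records` as a small JSON file under ./EJQTDE_buffer/<subdir>/
#   ejqtde_save.Transf()
#       merges the buffered files into the large per-year files
#   ejqtde_save.delete()
#       empties the buffer directory
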
# Thread-pool worker: collect the article links from one year's index page
def extract_href(link):
    driver = webdriver.Edge(options=options)
    try:
        driver.get(link)
        # page_load_strategy is 'none', so get() returns immediately; wait for
        # the results column before parsing (the 15 s timeout is an assumed
        # value, not taken from the original script)
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.ID, 'columnRight')))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        column_right = soup.find('div', id='columnRight')
        if column_right:
            for ordered_list in column_right.find_all('ol'):
                for list_item in ordered_list.find_all('li'):
                    matches = re.findall(
                        r'</a>: <a\s+href="(periodica\.html\?periodica=1&'
                        r'paramtipus_ertek=publication&param_ertek=\d+)"',
                        str(list_item))
                    for match in matches:
                        hrefs.append(urljoin(baseWeb, match))
        print('Links collected from:', link)
    finally:
        driver.quit()  # release the browser even if the page failed to load

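# Note: the workers append to the shared `hrefs` list without a lock; in
# CPython a bare list.append is atomic under the GIL, so this is safe here.
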
# Shared lists filled by the worker threads
Author_list = []
Article_list = []
hrefs = []

# Base web URLs
baseWeb = 'https://www.math.u-szeged.hu/ejqtde/'
current_year = datetime.datetime.now().year
years = range(2009, 2011)  # years = range(2010, current_year + 1)
url_list = [baseWeb + 'periodica.html?periodica=1&paramtipus_ertek=publications&param_ertek='
            + f'{year}' for year in years][::-1]

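# The list is reversed, so the newest year comes first; e.g. with the test
# range above, url_list[0] ends in '...&param_ertek=2010'.
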
# Edge options
options = Options()
options.add_argument('--headless')     # run Edge in headless mode
options.add_argument('--disable-gpu')  # disable GPU acceleration
options.page_load_strategy = 'none'    # don't block in get(); extract_href waits explicitly

# Use multithreading to process the year-index URLs concurrently
with ThreadPoolExecutor(max_workers=25) as executor:
    futures = [executor.submit(extract_href, url) for url in url_list]
    wait(futures)  # block until every index page has been processed

print('\nAll links have been collected.\n')

# Use multithreading to scrape the article and author data
count1 = 0  # number of article records saved
count2 = 0  # number of author records saved
scrawl_lock = threading.Lock()  # guards Article_list and Author_list

with ThreadPoolExecutor(max_workers=25) as executor:
    futures = [executor.submit(ejqtde_scrawler.scrawler, href, scrawl_lock,
                               Article_list, Author_list) for href in hrefs]
    for future in as_completed(futures):
        # Flush each buffer to disk once it holds 50 records. The length
        # check, save and clear all happen under the same lock the workers
        # append with, so no record can slip in between save and clear.
        with scrawl_lock:
            if len(Article_list) >= 50:
                count1 += len(Article_list)
                ejqtde_save.save_data(Article_list, "Article_TS")
                Article_list.clear()
            if len(Author_list) >= 50:
                count2 += len(Author_list)
                ejqtde_save.save_data(Author_list, "Author_TS")
                Author_list.clear()
    wait(futures)

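# For reference, ejqtde_save.save_data presumably dumps each batch as one
# small JSON file in the buffer directory. A minimal sketch of such a
# function, with the file naming purely an assumption (the real module is
# not part of this file):
#
#   import json, os, time
#
#   def save_data(records, subdir):
#       folder = os.path.join('EJQTDE_buffer', subdir)
#       os.makedirs(folder, exist_ok=True)                # create buffer dir
#       path = os.path.join(folder, f'{time.time_ns()}.json')
#       with open(path, 'w', encoding='utf-8') as f:
#           json.dump(records, f, ensure_ascii=False)     # one batch per file
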
# Flush whatever is left in the buffers
if Article_list:
    count1 += len(Article_list)
    ejqtde_save.save_data(Article_list, "Article_TS")
    Article_list.clear()
print('Finished: All article_data has been added to ./EJQTDE_buffer/Article_TS/')

if Author_list:
    count2 += len(Author_list)
    ejqtde_save.save_data(Author_list, "Author_TS")
    Author_list.clear()
print('Finished: All author_data has been added to ./EJQTDE_buffer/Author_TS/')

print('\nThe whole scrawler program has finished.\n')
print(count1, 'article records have been stored.')
print(count2, 'author records have been stored.')

# Merge the buffered files into the large per-year files, then clear the buffer
ejqtde_save.Transf()
ejqtde_save.delete()