import re
import datetime
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed, wait
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

import ejqtde_scrawler
import ejqtde_save

'''
Target site: 'https://www.math.u-szeged.hu/ejqtde'

========== Run order ==========
1. ejqtde_main      collects the issue links for each year
2. ejqtde_scrawler  scrapes each paper's metadata and author info -> calls
                    ejqtde_save -> buffers the records in small JSON files
3. ejqtde_save      filters the buffered files locally and merges them into
                    one large file per year
*4. ejqtde_save.delete() (optional)  deletes every file in the buffer
                    (back up first!)
'''

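# The two helper modules are not defined in this file. Judging only from how
# they are called below, their interfaces are roughly the following (a sketch
# of assumptions, not their actual implementation):
#
#   ejqtde_scrawler.scrawler(href, lock, article_list, author_list)
#       scrapes one paper page and, holding `lock`, appends record dicts to
#       the two shared lists
#   ejqtde_save.save_data(records, subdir)
#       dumps `records` as a small JSON file under ./EJQTDE_buffer/<subdir>/
#   ejqtde_save.Transf()
#       merges the buffered files into the large per-year files
#   ejqtde_save.delete()
#       empties the buffer directory
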
# Thread-pool worker: collect the article links from one year's index page
def extract_href(link):
    driver = webdriver.Edge(options=options)
    try:
        driver.get(link)
        # page_load_strategy is 'none', so get() returns immediately; wait for
        # the results column before parsing (the 15 s timeout is an assumed
        # value, not taken from the original script)
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.ID, 'columnRight')))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        column_right = soup.find('div', id='columnRight')
        if column_right:
            for ordered_list in column_right.find_all('ol'):
                for list_item in ordered_list.find_all('li'):
                    matches = re.findall(
                        r'</a>: <a\s+href="(periodica\.html\?periodica=1&'
                        r'paramtipus_ertek=publication&param_ertek=\d+)"',
                        str(list_item))
                    for match in matches:
                        hrefs.append(urljoin(baseWeb, match))
        print('Links collected from:', link)
    finally:
        driver.quit()  # release the browser even if the page failed to load

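# Note: the workers append to the shared `hrefs` list without a lock; in
# CPython a bare list.append is atomic under the GIL, so this is safe here.
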
# Shared lists filled by the worker threads
Author_list = []
Article_list = []
hrefs = []

# Base web URLs
baseWeb = 'https://www.math.u-szeged.hu/ejqtde/'
current_year = datetime.datetime.now().year
years = range(2009, 2011)  # years = range(2010, current_year + 1)
url_list = [baseWeb + 'periodica.html?periodica=1&paramtipus_ertek=publications&param_ertek='
            + f'{year}' for year in years][::-1]

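# The list is reversed, so the newest year comes first; e.g. with the test
# range above, url_list[0] ends in '...&param_ertek=2010'.
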
# Edge options
options = Options()
options.add_argument('--headless')     # run Edge in headless mode
options.add_argument('--disable-gpu')  # disable GPU acceleration
options.page_load_strategy = 'none'    # don't block in get(); extract_href waits explicitly

# Use multithreading to process the year-index URLs concurrently
with ThreadPoolExecutor(max_workers=25) as executor:
    futures = [executor.submit(extract_href, url) for url in url_list]
    wait(futures)  # block until every index page has been processed

print('\nAll links have been collected.\n')

# Use multithreading to scrape the article and author data
count1 = 0  # number of article records saved
count2 = 0  # number of author records saved
scrawl_lock = threading.Lock()  # guards Article_list and Author_list

with ThreadPoolExecutor(max_workers=25) as executor:
    futures = [executor.submit(ejqtde_scrawler.scrawler, href, scrawl_lock,
                               Article_list, Author_list) for href in hrefs]
    for future in as_completed(futures):
        # Flush each buffer to disk once it holds 50 records. The length
        # check, save and clear all happen under the same lock the workers
        # append with, so no record can slip in between save and clear.
        with scrawl_lock:
            if len(Article_list) >= 50:
                count1 += len(Article_list)
                ejqtde_save.save_data(Article_list, "Article_TS")
                Article_list.clear()
            if len(Author_list) >= 50:
                count2 += len(Author_list)
                ejqtde_save.save_data(Author_list, "Author_TS")
                Author_list.clear()
    wait(futures)

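# For reference, ejqtde_save.save_data presumably dumps each batch as one
# small JSON file in the buffer directory. A minimal sketch of such a
# function, with the file naming purely an assumption (the real module is
# not part of this file):
#
#   import json, os, time
#
#   def save_data(records, subdir):
#       folder = os.path.join('EJQTDE_buffer', subdir)
#       os.makedirs(folder, exist_ok=True)                # create buffer dir
#       path = os.path.join(folder, f'{time.time_ns()}.json')
#       with open(path, 'w', encoding='utf-8') as f:
#           json.dump(records, f, ensure_ascii=False)     # one batch per file
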
# Flush whatever is left in the buffers
if Article_list:
    count1 += len(Article_list)
    ejqtde_save.save_data(Article_list, "Article_TS")
    Article_list.clear()
print('Finished: All article_data has been added to ./EJQTDE_buffer/Article_TS/')

if Author_list:
    count2 += len(Author_list)
    ejqtde_save.save_data(Author_list, "Author_TS")
    Author_list.clear()
print('Finished: All author_data has been added to ./EJQTDE_buffer/Author_TS/')

print('\nThe whole scrawler program has finished.\n')
print(count1, 'article records have been stored.')
print(count2, 'author records have been stored.')

# Merge the buffered files into the large per-year files, then clear the buffer
ejqtde_save.Transf()
ejqtde_save.delete()