import re
import datetime
import threading
import urllib
import ejqtde_scrawler
import ejqtde_save
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.edge.options import Options
from concurrent.futures import ThreadPoolExecutor, as_completed, wait
from urllib.parse import urljoin

'''
Target site: https://www.math.u-szeged.hu/ejqtde
========== Run order ==========
1. ejqtde_href_multithread -- collect the journal links for every year
2. ejqtde_scrawler -- scrape each paper's metadata and author info
   -> calls ejqtde_save -> buffers the data into small JSON files
3. ejqtde_save -- filter the buffered small files locally, then merge them
   into one large file per year
*4. ejqtde_save.delete() (optional) -- delete every file in the buffer
   directory (make a backup first)
'''

BASE_URL = 'https://www.math.u-szeged.hu/ejqtde'

# Shared state for the worker threads. NOTE: the middle of the original file
# (from the regex in extract_href down to the first flush check) was lost;
# every line marked "reconstructed" below is a best-effort sketch inferred
# from the surviving code, not the author's exact implementation.
options = Options()
options.add_argument('--headless')   # reconstructed: run Edge without a window

locks = threading.Lock()             # guards the two shared buffers
Article_list = []                    # buffered article records
Author_list = []                     # buffered author records
count1 = 0                           # running total of stored article records
count2 = 0                           # running total of stored author records


# Multithread pool
def extract_href(link):
    '''Open one volume page and collect the paper links under #columnRight.'''
    driver = webdriver.Edge(options=options)
    driver.get(link)
    html_code = driver.page_source
    driver.quit()
    soup = BeautifulSoup(html_code, 'html.parser')

    hrefs = []
    column_right = soup.find('div', id='columnRight')
    if column_right:
        ordered_lists = column_right.find_all('ol')
        for idx, ordered_list in enumerate(ordered_lists, 1):
            for list_item in ordered_list.find_all('li'):
                # Reconstructed: the original pattern was lost; this one
                # simply captures every href attribute inside the list item.
                matches = re.findall(r'href="([^"]+)"', str(list_item))
                for match in matches:
                    hrefs.append(urljoin(link, match))
    return hrefs


# Reconstructed driver: submit one scraping task per paper link and flush the
# shared buffers into the JSON buffer directories every 50 records. The name
# ejqtde_scrawler.scrawl and its (articles, authors) return value are assumed;
# only the flush logic inside the loop survives from the original file.
with ThreadPoolExecutor(max_workers=8) as executor:
    paper_links = extract_href(BASE_URL)
    futures = [executor.submit(ejqtde_scrawler.scrawl, url) for url in paper_links]
    for future in as_completed(futures):
        articles, authors = future.result()
        Article_list.extend(articles)
        Author_list.extend(authors)
        if len(Article_list) >= 50:
            with locks:
                count1 += len(Article_list)
                ejqtde_save.save_data(Article_list, "Article_TS")
                Article_list.clear()
        if len(Author_list) >= 50:
            with locks:
                count2 += len(Author_list)
                ejqtde_save.save_data(Author_list, "Author_TS")
                Author_list.clear()
    wait(futures)

# Deal with the remaining data
if len(Article_list) > 0:
    count1 += len(Article_list)
    ejqtde_save.save_data(Article_list, "Article_TS")
    Article_list.clear()
print('Finished: All article_data has been added to ./EJQTDE_buffer/Article_TS/')
if len(Author_list) > 0:
    count2 += len(Author_list)
    ejqtde_save.save_data(Author_list, "Author_TS")
    Author_list.clear()
print('Finished: All author_data has been added to ./EJQTDE_buffer/Author_TS/')

print('\nThe whole scrawler program has finished\n')
print(count1, 'article_data records have been stored.')
print(count2, 'author_data records have been stored.')

ejqtde_save.Transf()   # merge the buffered small files into per-year files
ejqtde_save.delete()   # clear the buffer afterwards (see step *4 above)
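# Appendix: the flush-every-50 pattern above, reduced to a runnable,
# stdlib-only sketch. It is kept inside a string so it never executes with
# the scraper; the pool size, payloads, and flush() helper are illustrative,
# with flush() standing in for ejqtde_save.save_data.
'''
import threading
from concurrent.futures import ThreadPoolExecutor, wait

buffer, lock = [], threading.Lock()

def flush(batch):
    print('flushing', len(batch), 'records')   # stand-in for save_data()

def worker(i):
    with lock:                   # all access to the shared buffer is locked
        buffer.append(i)
        if len(buffer) >= 50:    # same threshold as the scraper above
            flush(buffer[:])
            buffer.clear()

with ThreadPoolExecutor(max_workers=8) as pool:
    wait([pool.submit(worker, i) for i in range(120)])

with lock:
    if buffer:                   # deal with the remaining data
        flush(buffer[:])
'''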