diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py
index 2563557..078572f 100644
--- a/01_EJDE_spider/ejde_main.py
+++ b/01_EJDE_spider/ejde_main.py
@@ -12,8 +12,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 '''
     Target website: 'ejde.math.txstate.edu'
-    Total number of papers: 2023/08/08 - 4339
-    Total Time via VPN w/52ms-delay: 430.38s
+    Total number of papers: 2023/08/08 - 4761
+    Total Time via VPN w/100ms-delay: 306.73s

     ========== Run order ==========
     1. ejde_main: fetch the journal links for each year -> scrape each paper's info and author info -> call ejde_save -> stash in small temporary JSON files
@@ -37,6 +37,7 @@ def datetime_transform(date):
         "Spetember": "September",
         "Septembere": "September",
         "Ocotber": "October",
+        "Nobember": "November",
     }
     try:
         input_date = datetime.strptime(date, "%B %d, %Y")
@@ -120,6 +121,9 @@ def process_html_article(baseweb, article):
         title = article.text.strip()
         title = re.sub(r'\s+', ' ', title).strip()
         article_url = baseweb + article.find_next("a")["href"]
+        if "../../index.html" in article_url:
+            print("Redundant URL:", article_url)
+            return
     except Exception as html_format_err:
         print("HTML FORMAT FAILURE:", str(html_format_err))
         fail = {
@@ -173,7 +177,18 @@ def process_article(title, article_url):
             volume = str(volume)
             issue = "Special Issue " + str(issue_number)
         else:
-            volume = None
+            volume_match = re.search(r'Conf\. (\d{2}), (\d{4})', article_text)
+            if volume_match:
+                issue = "Conference " + str(volume_match.group(1))
+                volume = str(volume_match.group(2))
+            else:
+                volume_match = re.search(r'Conference (\d+) \((\d+)\)', article_text)
+                if volume_match:
+                    issue_number, volume = volume_match.groups()
+                    volume = str(volume)
+                    issue = "Conference " + str(issue_number)
+                else:
+                    volume = None

     # Extract pp
     pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
@@ -209,7 +224,7 @@ def process_article(title, article_url):
     if keywords_match:
         keywords = re.sub(r'<[^>]+>', '', keywords_match.group(1).strip()).replace('\n', '')
         keywords = re.split(r', |;', keywords)
-        keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords]
+        keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords if len(keyword.strip())]
     else:
         keywords = []
@@ -270,6 +285,7 @@ def process_article(title, article_url):
                 }]
             }
             authorData.append(author_data)
+            authorNum += 1
     # If no author table
     else:
         match_type = 0
@@ -335,6 +351,7 @@ def process_article(title, article_url):
                 }]
             }
             authorData.append(author_data)
+            authorNum += 1
         else:
             print("AUTHOR SEARCHING ERROR:", article_url)
             fail = {
@@ -362,16 +379,15 @@ def process_article(title, article_url):
         "page": pp
     }
     articleData.append(article_data)
+    articleNum += 1

     # Save the data periodically based on batch size
     if len(articleData) % batch_size == 0:
         ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
-        articleNum += len(articleData)
         articleData.clear()

     if len(authorData) % batch_size == 0:
         ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
-        authorNum += len(authorData)
         authorData.clear()
@@ -392,7 +408,13 @@ soup = BeautifulSoup(response.content, 'html.parser')
 special_links = soup.find_all("a", href=True)
 url_list.extend(["https://ejde.math.txstate.edu/" + tag["href"] for tag in special_links[:-1]])

-# Initialize lists
+# Get all conference special issue URLs
+index = "https://ejde.math.txstate.edu/conf-toc.html#latest"
+response = requests.get(index)
+soup = BeautifulSoup(response.content, 'html.parser')
+special_links = soup.find_all("a", href=True)
+url_list.extend(["https://ejde.math.txstate.edu/" + tag["href"] for tag in special_links[:-2]])
+
 authorData = []
 articleData = []
 failedData = []
@@ -400,12 +422,11 @@ totallyFailedData = []
 failedVolData = []
 failedFormatData = []

-# Initialize variables for counting
 authorNum = 0
 articleNum = 0
 batch_size = 100  # Number of articles to process before saving

-executor = ThreadPoolExecutor(max_workers=len(url_list))  # Set the number of worker threads
+executor = ThreadPoolExecutor(max_workers=int(len(url_list)/2))  # Set the number of worker threads

 # Process each URL using multithreading
 futures = [executor.submit(process_volume, url) for url in url_list]
@@ -437,12 +458,10 @@ while failedData:
 if len(articleData) > 0:
     ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
 print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
-print("Total fetched paper:", len(articleData) + articleNum)

 if len(authorData) > 0:
     ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
 print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
-print("Total fetched author:", len(authorData) + authorNum)

 # Save error record
 if len(totallyFailedData) > 0:
@@ -457,7 +476,9 @@ if len(failedFormatData) > 0:
     ejde_save.save_data(failedFormatData, "Error", "Failed_format_record.json")
     print("Total failed searching article:", len(failedFormatData))

-# Total running time
+# Statistics
+print("Total fetched papers:", articleNum)
+print("Total fetched authors:", authorNum)
 print("time elapsed: {:.2f}s".format(time.time() - start_time))

 # Transfer to large file and delete the temporary storage files
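Note on the new conference-issue branch in process_article: the patch tries the pattern Conf\. (\d{2}), (\d{4}) first and falls back to Conference (\d+) \((\d+)\). Below is a minimal standalone sketch of the intended matching behavior; the sample strings are hypothetical stand-ins for EJDE citation lines, not verbatim page text.

import re

# Hypothetical citation fragments; the real input is article_text scraped
# from EJDE article pages.
samples = [
    "Electron. J. Differential Equations, Conf. 26, 2022, pp. 1-14",
    "Conference 23 (2016)",
    "Vol. 2023 (2023), No. 45",  # neither pattern matches -> volume stays None
]

for article_text in samples:
    volume, issue = None, None
    # Primary pattern: "Conf. NN, YYYY"
    m = re.search(r'Conf\. (\d{2}), (\d{4})', article_text)
    if m:
        issue = "Conference " + m.group(1)
        volume = m.group(2)
    else:
        # Fallback pattern: "Conference N (YYYY)"
        m = re.search(r'Conference (\d+) \((\d+)\)', article_text)
        if m:
            issue_number, volume = m.groups()
            issue = "Conference " + issue_number
    print(article_text, "->", issue, volume)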
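Note on the added "Nobember": "November" entry: it extends the month-typo map that datetime_transform consults before calling strptime with "%B %d, %Y". A minimal sketch of that correction step, assuming a dictionary-shaped typo map and an ISO output format (normalize_date is a hypothetical stand-in, not the repo's function):

from datetime import datetime

# Assumed shape of the typo map; only the entries visible in the patch
# are listed here.
MONTH_TYPOS = {
    "Spetember": "September",
    "Septembere": "September",
    "Ocotber": "October",
    "Nobember": "November",
}

def normalize_date(date: str) -> str:
    """Replace known month misspellings, then parse as '%B %d, %Y'."""
    for typo, fixed in MONTH_TYPOS.items():
        date = date.replace(typo, fixed)
    return datetime.strptime(date, "%B %d, %Y").strftime("%Y-%m-%d")

print(normalize_date("Nobember 5, 2019"))  # -> 2019-11-05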