Optimizations:

1. Add papers from conference special issues
2. Optimize the counting process (see the sketch below)
3. Improve the robustness of the periodic saves
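For item 2, a minimal sketch of the revised counting, assuming the same batch_size buffering as in the script (save_batch and add_article are illustrative stand-ins, not functions from the crawler): each counter is now bumped once per appended record instead of being derived from the buffer length at save time, so the totals no longer depend on when a batch happens to be flushed.

import json
import uuid

batch_size = 100
articleData, articleNum = [], 0

def save_batch(records, label):
    # Stand-in for ejde_save.save_data: one small JSON file per flushed batch.
    with open(f"{label}_{uuid.uuid4()}.json", "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False)

def add_article(record):
    global articleNum
    articleData.append(record)
    articleNum += 1  # counted immediately, independent of batch flushes
    if len(articleData) % batch_size == 0:
        save_batch(articleData, "Article_TS")
        articleData.clear()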
This commit is contained in:
ldy 2023-08-20 16:48:43 +08:00
parent e217342ce2
commit b5ce290ea5


@@ -12,8 +12,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 '''
 Crawls the site 'ejde.math.txstate.edu'
-Total number of papers: 2023/08/08 - 4339
-Total Time via VPN w/52ms-delay: 430.38s
+Total number of papers: 2023/08/08 - 4761
+Total Time via VPN w/100ms-delay: 306.73s
 ========== Execution order ==========
 1. ejde_main: get the journal links for each year -> scrape each paper's info and author info -> call ejde_save -> store temporarily in small JSON files
@@ -37,6 +37,7 @@ def datetime_transform(date):
         "Spetember": "September",
         "Septembere": "September",
         "Ocotber": "October",
+        "Nobember": "November",
     }
     try:
         input_date = datetime.strptime(date, "%B %d, %Y")
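The month-name typo map extended above feeds datetime_transform. A rough standalone sketch of the idea, assuming the map is applied to the raw date string before datetime.strptime (normalize_date below is illustrative, not the crawler's exact helper):

from datetime import datetime

# Misspellings seen on the site, mapped to correct month names (assumed usage).
MONTH_TYPOS = {
    "Spetember": "September",
    "Septembere": "September",
    "Ocotber": "October",
    "Nobember": "November",
}

def normalize_date(date_str):
    for typo, fixed in MONTH_TYPOS.items():
        date_str = date_str.replace(typo, fixed)
    return datetime.strptime(date_str, "%B %d, %Y")

print(normalize_date("Nobember 3, 2021"))  # -> 2021-11-03 00:00:00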
@@ -120,6 +121,9 @@ def process_html_article(baseweb, article):
         title = article.text.strip()
         title = re.sub(r'\s+', ' ', title).strip()
         article_url = baseweb + article.find_next("a")["href"]
+        if "../../index.html" in article_url:
+            print("Redundant URL:", article_url)
+            return
     except Exception as html_format_err:
         print("HTML FORMAT FAILURE:", str(html_format_err))
         fail = {
@@ -173,7 +177,18 @@ def process_article(title, article_url):
             volume = str(volume)
             issue = "Special Issue " + str(issue_number)
         else:
-            volume = None
+            volume_match = re.search(r'Conf\. (\d{2}), (\d{4})', article_text)
+            if volume_match:
+                issue = "Conference " + str(volume_match.group(1))
+                volume = str(volume_match.group(2))
+            else:
+                volume_match = re.search(r'Conference (\d+) \((\d+)\)', article_text)
+                if volume_match:
+                    issue_number, volume = volume_match.groups()
+                    volume = str(volume)
+                    issue = "Conference " + str(issue_number)
+                else:
+                    volume = None
     # Extract pp
     pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
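The two patterns added in this hunk cover conference special-issue headings in the two formats that apparently occur on the site. A self-contained check with made-up heading strings (parse_conference and the sample texts are illustrative only):

import re

def parse_conference(article_text):
    # First format, e.g. "Conf. 26, 2022" (two-digit conference number, four-digit year).
    m = re.search(r'Conf\. (\d{2}), (\d{4})', article_text)
    if m:
        return "Conference " + m.group(1), m.group(2)
    # Second format, e.g. "Conference 23 (2016)".
    m = re.search(r'Conference (\d+) \((\d+)\)', article_text)
    if m:
        return "Conference " + m.group(1), m.group(2)
    return None, None

print(parse_conference("Conf. 26, 2022, pp. 1-14"))         # ('Conference 26', '2022')
print(parse_conference("Conference 23 (2016), pp. 35-49"))  # ('Conference 23', '2016')

As in the diff, the conference number becomes the issue label and the year becomes the volume.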
@@ -209,7 +224,7 @@ def process_article(title, article_url):
     if keywords_match:
         keywords = re.sub(r'<[^>]+>', '', keywords_match.group(1).strip()).replace('\n', '')
         keywords = re.split(r', |;', keywords)
-        keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords]
+        keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords if len(keyword.strip())]
     else:
         keywords = []
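The added `if len(keyword.strip())` guard drops empty fragments that re.split can produce around doubled or trailing separators. A quick illustration with a made-up keyword string:

import re

raw = "parabolic equation; weak solution, , blow-up;"
parts = re.split(r', |;', raw)
# The old behaviour kept empty strings produced by ", ," and the trailing ";".
keywords = [re.sub(r'\s+', ' ', k.strip().strip('.')).strip() for k in parts if len(k.strip())]
print(keywords)  # ['parabolic equation', 'weak solution', 'blow-up']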
@@ -270,6 +285,7 @@ def process_article(title, article_url):
                 }]
             }
             authorData.append(author_data)
+            authorNum += 1
         # If no author table
         else:
             match_type = 0
@@ -335,6 +351,7 @@ def process_article(title, article_url):
                 }]
             }
             authorData.append(author_data)
+            authorNum += 1
         else:
             print("AUTHOR SEARCHING ERROR:", article_url)
             fail = {
@@ -362,16 +379,15 @@ def process_article(title, article_url):
         "page": pp
     }
     articleData.append(article_data)
+    articleNum += 1
     # Save the data periodically based on batch size
     if len(articleData) % batch_size == 0:
         ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
-        articleNum += len(articleData)
         articleData.clear()
     if len(authorData) % batch_size == 0:
         ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
-        authorNum += len(authorData)
         authorData.clear()
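Each periodic flush writes an independent JSON file named with a fresh uuid4 under ./ejde_buffer/, so parallel flushes never overwrite one another. A sketch of a save_data matching the calls in this diff (the directory handling and formatting are assumptions, not ejde_save's actual implementation):

import json
import os
import uuid

def save_data(records, subdir, filename, root="./ejde_buffer"):
    # One small JSON file per batch; unique names avoid collisions between concurrent flushes.
    path = os.path.join(root, subdir)
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, filename), "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)

# Example flush of a batch:
batch = [{"title": "Example paper", "volume": "2023"}]
save_data(batch, "Article_TS", str(uuid.uuid4()) + ".json")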
@@ -392,7 +408,13 @@ soup = BeautifulSoup(response.content, 'html.parser')
 special_links = soup.find_all("a", href=True)
 url_list.extend(["https://ejde.math.txstate.edu/" + tag["href"] for tag in special_links[:-1]])
-# Initialize lists
+# Get all conference special issues url
+index = "https://ejde.math.txstate.edu/conf-toc.html#latest"
+response = requests.get(index)
+soup = BeautifulSoup(response.content, 'html.parser')
+special_links = soup.find_all("a", href=True)
+url_list.extend(["https://ejde.math.txstate.edu/" + tag["href"] for tag in special_links[:-2]])
 authorData = []
 articleData = []
 failedData = []
@@ -400,12 +422,11 @@ totallyFailedData = []
 failedVolData = []
 failedFormatData = []
-# Initialize variables for counting
 authorNum = 0
 articleNum = 0
 batch_size = 100  # Number of articles to process before saving
-executor = ThreadPoolExecutor(max_workers=len(url_list))  # Set the number of worker threads
+executor = ThreadPoolExecutor(max_workers=int(len(url_list)/2))  # Set the number of worker threads
 # Process each URL using multithreading
 futures = [executor.submit(process_volume, url) for url in url_list]
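The executor is now sized at half the number of volume URLs instead of one worker per URL, presumably to keep the request rate gentler. A minimal sketch of the submit-and-collect pattern used here, with a dummy process_volume and placeholder URLs:

from concurrent.futures import ThreadPoolExecutor, as_completed

url_list = ["https://ejde.math.txstate.edu/"] * 4  # placeholder URLs

def process_volume(url):
    # Stand-in for the real scraping worker.
    return url

# Half as many threads as URLs; the max(1, ...) guard is extra here, not in the diff.
executor = ThreadPoolExecutor(max_workers=max(1, int(len(url_list) / 2)))
futures = [executor.submit(process_volume, url) for url in url_list]
for future in as_completed(futures):
    print("done:", future.result())
executor.shutdown()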
@@ -437,12 +458,10 @@ while failedData:
 if len(articleData) > 0:
     ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
     print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
-    print("Total fetched paper:", len(articleData) + articleNum)
 if len(authorData) > 0:
     ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
     print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
-    print("Total fetched author:", len(authorData) + authorNum)
 # Save error record
 if len(totallyFailedData) > 0:
@@ -457,7 +476,9 @@ if len(failedFormatData) > 0:
     ejde_save.save_data(failedFormatData, "Error", "Failed_format_record.json")
     print("Total failed searching article:", len(failedFormatData))
-# Total running time
+# Statistics
+print("Total fetched paper:", articleNum)
+print("Total fetched author:", authorNum)
 print("time elapsed: {:.2f}s".format(time.time() - start_time))
 # Transfer to large file and delete the temporary storage files