Optimization:
1. Add conference special issue papers 2. Optimize the counting process 3. Make saving more robust
parent e217342ce2
commit b5ce290ea5
@@ -12,8 +12,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 '''
 Crawled site: 'ejde.math.txstate.edu'

-Total number of papers: 2023/08/08 - 4339
-Total Time via VPN w/52ms-delay: 430.38s
+Total number of papers: 2023/08/08 - 4761
+Total Time via VPN w/100ms-delay: 306.73s

 ========== Execution order ==========
 1. ejde_main gets the journal links for each year -> scrapes each paper's information and author information -> calls ejde_save -> stores the results temporarily in small JSON files
@@ -37,6 +37,7 @@ def datetime_transform(date):
         "Spetember": "September",
+        "Septembere": "September",
         "Ocotber": "October",
         "Nobember": "November",
     }
     try:
         input_date = datetime.strptime(date, "%B %d, %Y")
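For reference, a minimal standalone sketch of the month-typo normalization this hunk extends; the mapping mirrors the diff, while the function name and the pre-parse replacement loop are illustrative rather than the scraper's actual code:

from datetime import datetime

MONTH_TYPOS = {  # hypothetical module-level name; mirrors the mapping in the diff
    "Spetember": "September",
    "Septembere": "September",
    "Ocotber": "October",
    "Nobember": "November",
}

def normalize_date(date_str):
    # Repair known month misspellings before parsing with strptime.
    for typo, fixed in MONTH_TYPOS.items():
        date_str = date_str.replace(typo, fixed)
    return datetime.strptime(date_str, "%B %d, %Y")

print(normalize_date("Septembere 3, 2021"))  # -> 2021-09-03 00:00:00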
@@ -120,6 +121,9 @@ def process_html_article(baseweb, article):
         title = article.text.strip()
         title = re.sub(r'\s+', ' ', title).strip()
         article_url = baseweb + article.find_next("a")["href"]
+        if "../../index.html" in article_url:
+            print("Redundant URL:", article_url)
+            return
     except Exception as html_format_err:
         print("HTML FORMAT FAILURE:", str(html_format_err))
         fail = {
@@ -172,6 +176,17 @@ def process_article(title, article_url):
         issue_number, volume = volume_match.groups()
         volume = str(volume)
         issue = "Special Issue " + str(issue_number)
     else:
-        volume = None
+        volume_match = re.search(r'Conf\. (\d{2}), (\d{4})', article_text)
+        if volume_match:
+            issue = "Conference " + str(volume_match.group(1))
+            volume = str(volume_match.group(2))
+        else:
+            volume_match = re.search(r'Conference (\d+) \((\d+)\)', article_text)
+            if volume_match:
+                issue_number, volume = volume_match.groups()
+                volume = str(volume)
+                issue = "Conference " + str(issue_number)
+            else:
+                volume = None

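The hunk above adds two conference patterns as fallbacks behind the special-issue match. A sketch of the overall fallback chain, assuming a 'Special Issue (\d+) \((\d+)\)' first pattern that the hunk does not show (sample inputs invented):

import re

def parse_issue_volume(article_text):
    # First pattern assumed; only its branch body appears in the hunk.
    m = re.search(r'Special Issue (\d+) \((\d+)\)', article_text)
    if m:
        return "Special Issue " + m.group(1), m.group(2)
    m = re.search(r'Conf\. (\d{2}), (\d{4})', article_text)
    if m:
        return "Conference " + m.group(1), m.group(2)
    m = re.search(r'Conference (\d+) \((\d+)\)', article_text)
    if m:
        return "Conference " + m.group(1), m.group(2)
    return None, None  # no volume information found

print(parse_issue_volume("Conf. 26, 2022"))        # ('Conference 26', '2022')
print(parse_issue_volume("Conference 10 (2003)"))  # ('Conference 10', '2003')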
@@ -209,7 +224,7 @@ def process_article(title, article_url):
     if keywords_match:
         keywords = re.sub(r'<[^>]+>', '', keywords_match.group(1).strip()).replace('\n', '')
         keywords = re.split(r', |;', keywords)
-        keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords]
+        keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords if len(keyword.strip())]
     else:
         keywords = []

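The changed line filters out empty keywords produced by the split. A standalone illustration with an invented sample string:

import re

raw = "Blow-up; a priori   estimates, , weak solutions."
keywords = re.split(r', |;', raw)
keywords = [re.sub(r'\s+', ' ', k.strip().strip('.')).strip() for k in keywords if len(k.strip())]
print(keywords)  # ['Blow-up', 'a priori estimates', 'weak solutions']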
@@ -270,6 +285,7 @@ def process_article(title, article_url):
             }]
         }
         authorData.append(author_data)
+        authorNum += 1
     # If no author table
     else:
         match_type = 0
@@ -335,6 +351,7 @@ def process_article(title, article_url):
             }]
         }
         authorData.append(author_data)
+        authorNum += 1
     else:
         print("AUTHOR SEARCHING ERROR:", article_url)
         fail = {
@@ -362,16 +379,15 @@ def process_article(title, article_url):
         "page": pp
     }
     articleData.append(article_data)
+    articleNum += 1

     # Save the data periodically based on batch size
     if len(articleData) % batch_size == 0:
         ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
-        articleNum += len(articleData)
         articleData.clear()

     if len(authorData) % batch_size == 0:
         ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
-        authorNum += len(authorData)
         authorData.clear()


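The counting now happens per item instead of at batch-save time. A minimal sketch of this batch-flush pattern, with ejde_save.save_data assumed to dump a list to a JSON file in the buffer directory (records and paths invented):

import json
import os
import uuid

def save_data(data, subdir, filename, root="./ejde_buffer"):
    # Assumed behavior of ejde_save.save_data: dump one batch to a small JSON file.
    os.makedirs(os.path.join(root, subdir), exist_ok=True)
    with open(os.path.join(root, subdir, filename), "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False)

articleData, articleNum, batch_size = [], 0, 100
for record in ({"title": "paper %d" % i} for i in range(250)):  # stand-in records
    articleData.append(record)
    articleNum += 1  # count per item, as the new code does
    if len(articleData) % batch_size == 0:
        save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
        articleData.clear()  # the flushed batch leaves memory; articleNum keeps the total

print(articleNum)  # 250, including the 50 records still unflushed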
@@ -392,7 +408,13 @@ soup = BeautifulSoup(response.content, 'html.parser')
 special_links = soup.find_all("a", href=True)
 url_list.extend(["https://ejde.math.txstate.edu/" + tag["href"] for tag in special_links[:-1]])

-# Initialize lists
+# Get all conference special issues url
+index = "https://ejde.math.txstate.edu/conf-toc.html#latest"
+response = requests.get(index)
+soup = BeautifulSoup(response.content, 'html.parser')
+special_links = soup.find_all("a", href=True)
+url_list.extend(["https://ejde.math.txstate.edu/" + tag["href"] for tag in special_links[:-2]])
+
 authorData = []
 articleData = []
 failedData = []
@@ -400,12 +422,11 @@ totallyFailedData = []
 failedVolData = []
 failedFormatData = []

 # Initialize variables for counting
 authorNum = 0
 articleNum = 0

 batch_size = 100  # Number of articles to process before saving
-executor = ThreadPoolExecutor(max_workers=len(url_list))  # Set the number of worker threads
+executor = ThreadPoolExecutor(max_workers=int(len(url_list)/2))  # Set the number of worker threads

 # Process each URL using multithreading
 futures = [executor.submit(process_volume, url) for url in url_list]
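The worker count drops from one thread per URL to half that. A sketch of the submit/as_completed pattern with a stand-in worker; note that int(len(url_list)/2) is 0 for a single-URL list, so this sketch adds a max(1, ...) guard:

from concurrent.futures import ThreadPoolExecutor, as_completed

def process_volume(url):  # stand-in for the real scraping worker
    return url.lower()

url_list = ["https://ejde.math.txstate.edu/Volumes/2023", "https://ejde.math.txstate.edu/conf-toc.html"]
executor = ThreadPoolExecutor(max_workers=max(1, int(len(url_list) / 2)))
futures = [executor.submit(process_volume, url) for url in url_list]
for future in as_completed(futures):
    print(future.result())  # results arrive as workers finish, not in submit order
executor.shutdown()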
@@ -437,12 +458,10 @@ while failedData:
 if len(articleData) > 0:
     ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
     print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
-    print("Total fetched paper:", len(articleData) + articleNum)

 if len(authorData) > 0:
     ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
     print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
-    print("Total fetched author:", len(authorData) + authorNum)

 # Save error record
 if len(totallyFailedData) > 0:
@@ -457,7 +476,9 @@ if len(failedFormatData) > 0:
     ejde_save.save_data(failedFormatData, "Error", "Failed_format_record.json")
     print("Total failed searching article:", len(failedFormatData))

-# Total running time
+# Statistics
+print("Total fetched paper:", articleNum)
+print("Total fetched author:", authorNum)
 print("time elapsed: {:.2f}s".format(time.time() - start_time))

 # Transfer to large file and delete the temporary storage files
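The final comment points at a merge step that is not shown in this diff. A hypothetical sketch of what "transfer to large file and delete the temporary storage files" could look like, assuming each buffer file holds a JSON list as saved above (the output filename is invented):

import glob
import json
import os

def merge_buffer(subdir, out_file, root="./ejde_buffer"):
    merged = []
    for path in glob.glob(os.path.join(root, subdir, "*.json")):
        with open(path, encoding="utf-8") as f:
            merged.extend(json.load(f))  # each small file holds a list of records
        os.remove(path)  # delete the temporary file once absorbed
    with open(out_file, "w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False)

merge_buffer("Article_TS", "ejde_articles.json")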