Optimization:
1. Add conference special-issue papers. 2. Optimize the counting process. 3. Improve saving robustness.
parent e217342ce2
commit b5ce290ea5
@ -12,8 +12,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 '''
 Target site: 'ejde.math.txstate.edu'
 
-Total number of papers: 2023/08/08 - 4339
-Total Time via VPN w/52ms-delay: 430.38s
+Total number of papers: 2023/08/08 - 4761
+Total Time via VPN w/100ms-delay: 306.73s
 
 ========== Run order ==========
 1. ejde_main fetches the journal links for each year -> scrapes each paper's metadata and author info -> calls ejde_save -> stores the results temporarily in small JSON files
@ -37,6 +37,7 @@ def datetime_transform(date):
     "Spetember": "September",
     "Septembere": "September",
     "Ocotber": "October",
+    "Nobember": "November",
 }
 try:
     input_date = datetime.strptime(date, "%B %d, %Y")
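Note on this hunk: the dictionary maps month-name typos that appear on EJDE pages to their correct spellings so that datetime.strptime can parse the dates. How the map is applied is outside this hunk, so the sketch below assumes a simple substring replacement; normalize_date and the sample date are hypothetical.

    from datetime import datetime

    # Typo map as in the diff; the replacement loop is an assumed application.
    MONTH_FIXES = {
        "Spetember": "September",
        "Septembere": "September",
        "Ocotber": "October",
        "Nobember": "November",
    }

    def normalize_date(date):
        # Hypothetical helper: repair known typos, then parse as the diff does.
        for typo, fix in MONTH_FIXES.items():
            date = date.replace(typo, fix)
        return datetime.strptime(date, "%B %d, %Y")

    print(normalize_date("Nobember 5, 2022"))  # -> 2022-11-05 00:00:00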
@ -120,6 +121,9 @@ def process_html_article(baseweb, article):
         title = article.text.strip()
         title = re.sub(r'\s+', ' ', title).strip()
         article_url = baseweb + article.find_next("a")["href"]
+        if "../../index.html" in article_url:
+            print("Redundant URL:", article_url)
+            return
     except Exception as html_format_err:
         print("HTML FORMAT FAILURE:", str(html_format_err))
         fail = {
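The new guard skips anchors that resolve back to the site index instead of an article page. A tiny illustration, with an assumed base URL:

    # Assumed values for illustration only.
    baseweb = "https://ejde.math.txstate.edu/Volumes/2023/"
    article_url = baseweb + "../../index.html"

    if "../../index.html" in article_url:
        print("Redundant URL:", article_url)  # logged and skipped, not scraped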
@ -172,6 +176,17 @@ def process_article(title, article_url):
         issue_number, volume = volume_match.groups()
         volume = str(volume)
         issue = "Special Issue " + str(issue_number)
+    else:
+        volume_match = re.search(r'Conf\. (\d{2}), (\d{4})', article_text)
+        if volume_match:
+            issue = "Conference " + str(volume_match.group(1))
+            volume = str(volume_match.group(2))
+        else:
+            volume_match = re.search(r'Conference (\d+) \((\d+)\)', article_text)
+            if volume_match:
+                issue_number, volume = volume_match.groups()
+                volume = str(volume)
+                issue = "Conference " + str(issue_number)
 else:
     volume = None
 
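This fallback chain covers two ways conference issues are cited. A quick check of what each regex captures; the sample strings are assumed formats, not taken verbatim from the site:

    import re

    for article_text in ["Conf. 26, 2022", "Conference 26 (2022)"]:
        m = re.search(r'Conf\. (\d{2}), (\d{4})', article_text)
        if not m:
            m = re.search(r'Conference (\d+) \((\d+)\)', article_text)
        # Both patterns capture (issue number, volume year) in that order.
        print("Conference " + m.group(1), m.group(2))  # Conference 26 2022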
@ -209,7 +224,7 @@ def process_article(title, article_url):
     if keywords_match:
         keywords = re.sub(r'<[^>]+>', '', keywords_match.group(1).strip()).replace('\n', '')
         keywords = re.split(r', |;', keywords)
-        keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords]
+        keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords if len(keyword.strip())]
     else:
         keywords = []
 
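The added `if len(keyword.strip())` filter matters because re.split leaves empty or whitespace-only strings behind when the keyword field ends with a separator. For example (the raw string is an assumed input):

    import re

    raw = "weak solutions; a priori estimates; "
    keywords = re.split(r', |;', raw)
    # -> ['weak solutions', ' a priori estimates', ' ']: the trailing ' '
    # would otherwise survive as '' after cleaning, so the guard drops it.
    keywords = [re.sub(r'\s+', ' ', k.strip().strip('.')).strip()
                for k in keywords if len(k.strip())]
    print(keywords)  # ['weak solutions', 'a priori estimates']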
@ -270,6 +285,7 @@ def process_article(title, article_url):
             }]
         }
         authorData.append(author_data)
+        authorNum += 1
     # If no author table
     else:
         match_type = 0
@ -335,6 +351,7 @@ def process_article(title, article_url):
             }]
         }
         authorData.append(author_data)
+        authorNum += 1
     else:
         print("AUTHOR SEARCHING ERROR:", article_url)
         fail = {
@ -362,16 +379,15 @@ def process_article(title, article_url):
         "page": pp
     }
     articleData.append(article_data)
+    articleNum += 1
 
     # Save the data periodically based on batch size
     if len(articleData) % batch_size == 0:
         ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
-        articleNum += len(articleData)
         articleData.clear()
 
     if len(authorData) % batch_size == 0:
         ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
-        authorNum += len(authorData)
         authorData.clear()
 
 
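Counting per appended record keeps the totals independent of flush timing and lets the final statistics print articleNum directly (see the last hunk), instead of reconstructing the total as len(articleData) + articleNum at exit. A simplified, single-threaded sketch of the revised pattern, with save_batch standing in for ejde_save.save_data:

    batch_size = 100
    articleData = []
    articleNum = 0

    def save_batch(batch):
        pass  # stand-in for ejde_save.save_data(batch, "Article_TS", ...)

    def handle(article_data):
        global articleNum
        articleData.append(article_data)
        articleNum += 1  # counted at append time, not at flush time
        if len(articleData) % batch_size == 0:
            save_batch(articleData)
            articleData.clear()  # flushed records stay counted in articleNum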
@ -392,7 +408,13 @@ soup = BeautifulSoup(response.content, 'html.parser')
 special_links = soup.find_all("a", href=True)
 url_list.extend(["https://ejde.math.txstate.edu/" + tag["href"] for tag in special_links[:-1]])
 
-# Initialize lists
+# Get all conference special issues url
+index = "https://ejde.math.txstate.edu/conf-toc.html#latest"
+response = requests.get(index)
+soup = BeautifulSoup(response.content, 'html.parser')
+special_links = soup.find_all("a", href=True)
+url_list.extend(["https://ejde.math.txstate.edu/" + tag["href"] for tag in special_links[:-2]])
+
 authorData = []
 articleData = []
 failedData = []
@ -400,12 +422,11 @@ totallyFailedData = []
 failedVolData = []
 failedFormatData = []
 
-# Initialize variables for counting
 authorNum = 0
 articleNum = 0
 
 batch_size = 100  # Number of articles to process before saving
-executor = ThreadPoolExecutor(max_workers=len(url_list))  # Set the number of worker threads
+executor = ThreadPoolExecutor(max_workers=int(len(url_list)/2))  # Set the number of worker threads
 
 # Process each URL using multithreading
 futures = [executor.submit(process_volume, url) for url in url_list]
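One edge case worth noting on the halved worker count: int(len(url_list) / 2) evaluates to 0 when url_list has a single entry, and ThreadPoolExecutor raises ValueError for max_workers <= 0. A defensive variant (not part of this commit) would floor it at 1:

    from concurrent.futures import ThreadPoolExecutor

    url_list = ["https://ejde.math.txstate.edu/"]  # hypothetical one-URL run
    # max_workers must be > 0; int(1 / 2) == 0 would raise ValueError.
    executor = ThreadPoolExecutor(max_workers=max(1, len(url_list) // 2))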
@ -437,12 +458,10 @@ while failedData:
 if len(articleData) > 0:
     ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
 print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
-print("Total fetched paper:", len(articleData) + articleNum)
 
 if len(authorData) > 0:
     ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
 print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
-print("Total fetched author:", len(authorData) + authorNum)
 
 # Save error record
 if len(totallyFailedData) > 0:
@ -457,7 +476,9 @@ if len(failedFormatData) > 0:
     ejde_save.save_data(failedFormatData, "Error", "Failed_format_record.json")
     print("Total failed searching article:", len(failedFormatData))
 
-# Total running time
+# Statistics
+print("Total fetched paper:", articleNum)
+print("Total fetched author:", authorNum)
 print("time elapsed: {:.2f}s".format(time.time() - start_time))
 
 # Transfer to large file and delete the temporary storage files