Optimization:

less memory usage
data collection for volume HTML format errors
added elapsed-time monitor
ldy 2023-08-10 12:57:28 +08:00
parent 2c25682f81
commit 71e613d994


@@ -12,8 +12,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 '''
 Scrapes the website 'ejde.math.txstate.edu'
-Total number of papers:
-2023/08/08 - 4300
+Total number of papers: 2023/08/08 - 4300
+Total Time via VPN w/119ms-delay: 441.80s
 ==========Execution order==========
 1. ejde_main: fetch the journal links for each year -> scrape each paper's metadata and author info -> call ejde_save -> store in small temporary JSON files
@@ -81,6 +81,10 @@ def process_volume(url):
             articles.extend(i for i in i_elements)
         else:
             print("HTML FORMAT FAILURE:", url)
+            fail = {
+                "website": url
+            }
+            failedFormatData.append(fail)
             return
         break
     except Exception as fetch_err:
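
The four added lines mirror the pattern already used for fetch failures: each volume whose page layout the parser cannot handle is recorded as a {"website": url} dict in failedFormatData, which the end of the script dumps to Failed_format_record.json. A minimal sketch (not part of the commit; retry_failed_volumes is a hypothetical helper, and it assumes save_data writes a plain JSON list) of reloading that record and feeding the URLs back through process_volume:

    import json

    def retry_failed_volumes(path="Failed_format_record.json"):
        # Hypothetical helper: reload the failure record written at the end
        # of a run and push each URL back through process_volume.
        with open(path, encoding="utf-8") as f:
            records = json.load(f)             # list of {"website": url} dicts
        for record in records:
            process_volume(record["website"])  # defined in the script above
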
@@ -91,12 +95,12 @@ def process_volume(url):
     else:
         print("HTML FETCHING FAILURE:", url)
         fail = {
-            "website": url,
+            "website": url
         }
         failedVolData.append(fail)
         return
-    # Process each article using multithreading
+    # Process each article using multithreading (>20 threads would cause more errors)
     volume_executor = ThreadPoolExecutor(max_workers=15)
     volume_futures = [volume_executor.submit(process_html_article, baseWeb, article) for article in articles]
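
The updated comment pins down why max_workers stays at 15: above roughly 20 concurrent threads the site returned noticeably more failures, which looks like server-side throttling. Note that each volume gets its own 15-worker pool on top of the outer volume pool, so the total number of in-flight requests can exceed 15. A sketch (not in the commit) of capping the global request count with one shared semaphore:

    import threading
    import requests

    # Assumed global cap of ~20 in-flight requests, per the comment above.
    REQUEST_SLOTS = threading.Semaphore(20)

    def fetch(url, **kwargs):
        # Every worker, from any pool, must take a slot before hitting the site.
        with REQUEST_SLOTS:
            return requests.get(url, timeout=30, **kwargs)
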
@@ -109,7 +113,6 @@ def process_volume(url):
 def process_html_article(baseweb, article):
-    global articleNum, authorNum
     # Get article title & url
     try:
         title = article.text.strip()
@@ -118,7 +121,7 @@ def process_html_article(baseweb, article):
     except Exception as html_format_err:
         print("HTML FORMAT FAILURE:", str(html_format_err))
         fail = {
-            "article": str(article),
+            "article": str(article)
         }
         failedFormatData.append(fail)
         return
@@ -130,25 +133,15 @@ def process_html_article(baseweb, article):
         print("ARTICLE PROCESSING FAILURE:", str(article_err))
         fail = {
             "title": title,
-            "URL": article_url,
+            "URL": article_url
         }
         failedData.append(fail)
         return
-    # Save the data periodically based on batch size
-    if len(articleData) % batch_size == 0:
-        ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
-        articleNum += len(articleData)
-        articleData.clear()
-    if len(authorData) % batch_size == 0:
-        ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
-        authorNum += len(authorData)
-        authorData.clear()
 @retry(wait_fixed=5000, stop_max_attempt_number=5)
 def process_article(title, article_url):
-    global articleNum, authorNum
     headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
     article_response = requests.get(article_url, headers=headers)
@@ -241,7 +234,7 @@ def process_article(title, article_url):
                 "affiliation": [{
                     "year": volume,
                     "affiliation": affiliation,
-                    "email": email,
+                    "email": email
                 }]
             }
             authorData.append(author_data)
@@ -277,7 +270,7 @@ def process_article(title, article_url):
                 "affiliation": [{
                     "year": volume,
                     "affiliation": affiliation,
-                    "email": email,
+                    "email": email
                 }]
             }
             authorData.append(author_data)
@@ -301,11 +294,23 @@ def process_article(title, article_url):
         "journal": "Electronic Journal of Differential Equations",
         "volume": volume,
         "issue": issue,
-        "page": pp,
+        "page": pp
     }
     articleData.append(article_data)
+    # Save the data periodically based on batch size
+    if len(articleData) % batch_size == 0:
+        ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
+        articleNum += len(articleData)
+        articleData.clear()
+    if len(authorData) % batch_size == 0:
+        ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
+        authorNum += len(authorData)
+        authorData.clear()
+start_time = time.time()
 index = "https://ejde.math.txstate.edu/indexleft.html"
 response = requests.get(index)
 soup = BeautifulSoup(response.content, 'html.parser')
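
This hunk is the heart of the "less memory usage" change: the periodic flush moves from process_html_article to the end of process_article, so articleData and authorData are written to small temporary JSON files and cleared roughly every batch_size records instead of accumulating. Two caveats follow from the surrounding code: 15 threads append to these shared lists, so the len(...) % batch_size == 0 trigger can be skipped when appends interleave between checks, and the articleNum/authorNum updates still need global declarations inside the function now that those were removed. A lock-protected variant, as a minimal sketch only (save_lock and flush_if_full are not in the commit; batch_size and ejde_save are the script's module-level names):

    import threading
    import uuid

    save_lock = threading.Lock()

    def flush_if_full(buffer, folder):
        # Flush a shared buffer under a lock; `>=` instead of `% batch_size == 0`
        # so a full batch is never missed when threads append between checks.
        with save_lock:
            if len(buffer) >= batch_size:
                written = len(buffer)
                ejde_save.save_data(buffer, folder, str(uuid.uuid4()) + ".json")
                buffer.clear()
                return written
            return 0

    # Usage at the end of process_article (with `global articleNum, authorNum`):
    #     articleNum += flush_if_full(articleData, "Article_TS")
    #     authorNum += flush_if_full(authorData, "Author_TS")
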
@@ -341,7 +346,9 @@ for future in as_completed(futures):
         print("VOLUME PROCESSING ERROR:", str(vol_err))
 # Retry failed processing paper
-for data in failedData:
+print("START RETRYING:", len(failedData))
+while failedData:
+    data = failedData.pop(0)
     articleTitle = data["title"]
     articleUrl = data["URL"]
     try:
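
Replacing the for loop with a draining while loop means the list empties as it goes, releasing each retried record, and any entry appended to failedData mid-pass would still be picked up. One cost: list.pop(0) shifts every remaining element, so draining n records is O(n^2). A deque keeps the same drain semantics at O(1) per pop, as a sketch (not in the commit):

    from collections import deque

    # Same drain-while-appending behaviour, but popleft() is O(1).
    retry_queue = deque(failedData)
    failedData.clear()
    while retry_queue:
        data = retry_queue.popleft()
        # ... retry process_article(data["title"], data["URL"]) as above ...
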
@@ -350,7 +357,7 @@ for data in failedData:
         print("ARTICLE RETRYING FAILURE:", str(retry_err))
         totally_fail = {
             "title": articleTitle,
-            "URL": articleUrl,
+            "URL": articleUrl
         }
         totallyFailedData.append(totally_fail)
@@ -378,6 +385,9 @@ if len(failedFormatData) > 0:
     ejde_save.save_data(failedFormatData, "", "Failed_format_record.json")
     print("Total failed searching article:", len(failedFormatData))
+# Total running time
+print("time elapsed: {:.2f}s".format(time.time() - start_time))
 # Transfer to large file and delete the temporary storage files
 ejde_save.Transf()
 # ejde_save.delete()
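
The new start_time/print pair produces the wall-clock figure quoted in the docstring (441.80s for the full 4300-paper crawl). time.time() works here, but it follows the system clock; time.perf_counter() is the monotonic alternative usually preferred for elapsed-time measurement:

    import time

    start = time.perf_counter()   # monotonic, unaffected by clock adjustments
    time.sleep(0.1)               # stand-in for the crawl
    print("time elapsed: {:.2f}s".format(time.perf_counter() - start))
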