Optimization:

Reduced memory usage
Added data collection for volume HTML format errors
Added an elapsed-time monitor
ldy 2023-08-10 12:57:28 +08:00
parent 2c25682f81
commit 71e613d994


@@ -12,8 +12,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
'''
Scrapes the website 'ejde.math.txstate.edu'
Total number of papers:
2023/08/08 - 4300
Total number of papers: 2023/08/08 - 4300
Total Time via VPN w/119ms-delay: 441.80s
========== Execution order ==========
1. ejde_main: fetch the journal links for each year -> scrape each paper's metadata and author info -> call ejde_save -> stage the results in small temporary JSON files
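For context, ejde_save.save_data is what stages each scraped batch as a small JSON file; below is a minimal sketch of such a helper, assuming a simple directory-per-category layout (the layout and file contents are assumptions inferred from the calls visible in this diff):

import json
import os
import uuid

def save_data(data, subdir, filename):
    # Stage one batch of records as a small JSON file; ejde_save.Transf()
    # later merges these temporary files into one large file (assumption).
    target_dir = subdir if subdir else "."
    os.makedirs(target_dir, exist_ok=True)
    with open(os.path.join(target_dir, filename), "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

# Usage mirroring the calls in this commit:
save_data([{"title": "example"}], "Article_TS", str(uuid.uuid4()) + ".json")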
@@ -81,6 +81,10 @@ def process_volume(url):
                articles.extend(i for i in i_elements)
            else:
                print("HTML FORMAT FAILURE:", url)
                fail = {
                    "website": url
                }
                failedFormatData.append(fail)
                return
            break
        except Exception as fetch_err:
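This is the error-data collection the commit message refers to: instead of only printing, each volume whose HTML does not match the expected format is recorded as a dict, so the full list can be dumped to JSON at the end of the run. A standalone sketch of the pattern (names mirror this diff; the surrounding retry loop is omitted):

failedFormatData = []  # shared across workers; appended to on each format failure

def record_format_failure(url):
    # Keep a machine-readable record rather than just a log line, so the
    # failures can be saved (Failed_format_record.json) and audited later.
    print("HTML FORMAT FAILURE:", url)
    failedFormatData.append({"website": url})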
@@ -91,12 +95,12 @@ def process_volume(url):
    else:
        print("HTML FETCHING FAILURE:", url)
        fail = {
            "website": url,
            "website": url
        }
        failedVolData.append(fail)
        return
    # Process each article using multithreading
    # Process each article using multithreading (more than 20 threads caused additional errors)
    volume_executor = ThreadPoolExecutor(max_workers=15)
    volume_futures = [volume_executor.submit(process_html_article, baseWeb, article) for article in articles]
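The fan-out above submits one process_html_article task per article and caps the pool at 15 workers, below the roughly 20-thread point where the author observed extra errors (likely server-side throttling). A self-contained sketch of the same bounded fan-out, with a placeholder worker:

from concurrent.futures import ThreadPoolExecutor, as_completed

def handle_article(url):
    # Placeholder for per-article work such as process_html_article.
    return url

urls = ["https://ejde.math.txstate.edu/Volumes/2023/index.html"]  # illustrative input
with ThreadPoolExecutor(max_workers=15) as pool:  # stay under the ~20-thread error threshold
    futures = [pool.submit(handle_article, u) for u in urls]
    for future in as_completed(futures):
        try:
            future.result()  # re-raises any exception from the worker thread
        except Exception as err:
            print("WORKER FAILURE:", err)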
@@ -109,7 +113,6 @@ def process_volume(url):
def process_html_article(baseweb, article):
    global articleNum, authorNum
    # Get article title & url
    try:
        title = article.text.strip()
@@ -118,7 +121,7 @@ def process_html_article(baseweb, article):
    except Exception as html_format_err:
        print("HTML FORMAT FAILURE:", str(html_format_err))
        fail = {
            "article": str(article),
            "article": str(article)
        }
        failedFormatData.append(fail)
        return
@@ -130,25 +133,15 @@ def process_html_article(baseweb, article):
        print("ARTICLE PROCESSING FAILURE:", str(article_err))
        fail = {
            "title": title,
            "URL": article_url,
            "URL": article_url
        }
        failedData.append(fail)
        return
    # Save the data periodically based on batch size
    if len(articleData) % batch_size == 0:
        ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
        articleNum += len(articleData)
        articleData.clear()
    if len(authorData) % batch_size == 0:
        ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
        authorNum += len(authorData)
        authorData.clear()
@retry(wait_fixed=5000, stop_max_attempt_number=5)
def process_article(title, article_url):
    global articleNum, authorNum
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
    article_response = requests.get(article_url, headers=headers)
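The @retry decorator comes from the retrying package: wait_fixed=5000 waits 5 s between attempts and stop_max_attempt_number=5 gives up after five tries, re-raising the last exception. A minimal sketch of the same configuration on a bare HTTP fetch:

import requests
from retrying import retry

@retry(wait_fixed=5000, stop_max_attempt_number=5)
def fetch(url):
    # Any exception triggers a retry: up to 5 attempts, 5 s apart;
    # after the fifth failure the exception propagates to the caller.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response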
@@ -241,7 +234,7 @@ def process_article(title, article_url):
            "affiliation": [{
                "year": volume,
                "affiliation": affiliation,
                "email": email,
                "email": email
            }]
        }
        authorData.append(author_data)
@@ -277,7 +270,7 @@ def process_article(title, article_url):
            "affiliation": [{
                "year": volume,
                "affiliation": affiliation,
                "email": email,
                "email": email
            }]
        }
        authorData.append(author_data)
@@ -301,11 +294,23 @@ def process_article(title, article_url):
        "journal": "Electronic Journal of Differential Equations",
        "volume": volume,
        "issue": issue,
        "page": pp,
        "page": pp
    }
    articleData.append(article_data)
    # Save the data periodically based on batch size
    if len(articleData) % batch_size == 0:
        ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
        articleNum += len(articleData)
        articleData.clear()
    if len(authorData) % batch_size == 0:
        ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
        authorNum += len(authorData)
        authorData.clear()
start_time = time.time()
index = "https://ejde.math.txstate.edu/indexleft.html"
response = requests.get(index)
soup = BeautifulSoup(response.content, 'html.parser')
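Moving the batch flush into process_article is the memory optimization: articleData and authorData are written out and cleared whenever they reach a multiple of batch_size, so records no longer accumulate for the whole run. One caveat: with 15 threads appending concurrently, two appends can land between the modulo check and the flush, so the boundary can be missed. A hedged sketch of a lock-protected variant (batch_size, save_batch, and the lock are illustrative additions, not part of this commit):

import threading

batch_size = 100          # illustrative value; the real one is defined elsewhere
articleData = []
save_lock = threading.Lock()

def save_batch(records):
    # Stand-in for ejde_save.save_data(records, "Article_TS", "<uuid>.json").
    print("flushing", len(records), "records")

def append_article(record):
    # Guard the append + size check together so concurrent threads cannot
    # race past the batch boundary; >= instead of % tolerates a missed check.
    with save_lock:
        articleData.append(record)
        if len(articleData) >= batch_size:
            save_batch(list(articleData))
            articleData.clear()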
@@ -341,7 +346,9 @@ for future in as_completed(futures):
        print("VOLUME PROCESSING ERROR:", str(vol_err))
# Retry failed processing paper
for data in failedData:
print("START RETRYING:", len(failedData))
while failedData:
    data = failedData.pop(0)
    articleTitle = data["title"]
    articleUrl = data["URL"]
    try:
@@ -350,7 +357,7 @@ for data in failedData:
        print("ARTICLE RETRYING FAILURE:", str(retry_err))
        totally_fail = {
            "title": articleTitle,
            "URL": articleUrl,
            "URL": articleUrl
        }
        totallyFailedData.append(totally_fail)
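The switch from iterating failedData to draining it with pop(0) avoids mutating a list while looping over it and naturally picks up any entries appended during the retry pass. A compact sketch of the drain pattern (process_article is stubbed here):

failedData = [{"title": "t", "URL": "https://example.org/p"}]  # illustrative
totallyFailedData = []

def process_article(title, article_url):
    pass  # stand-in for the real retried fetch

print("START RETRYING:", len(failedData))
while failedData:                # safe even if retries append new failures
    data = failedData.pop(0)     # consume from the front; never iterate in place
    try:
        process_article(data["title"], data["URL"])
    except Exception as retry_err:
        print("ARTICLE RETRYING FAILURE:", str(retry_err))
        totallyFailedData.append(data)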
@@ -378,6 +385,9 @@ if len(failedFormatData) > 0:
    ejde_save.save_data(failedFormatData, "", "Failed_format_record.json")
    print("Total failed searching article:", len(failedFormatData))
# Total running time
print("time elapsed: {:.2f}s".format(time.time() - start_time))
# Transfer to large file and delete the temporary storage files
ejde_save.Transf()
# ejde_save.delete()
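The new elapsed-time monitor brackets the run with time.time(). For measuring durations, time.perf_counter() is generally preferable because it is monotonic and unaffected by system clock adjustments; a one-line alternative:

import time

start_time = time.perf_counter()   # monotonic; immune to clock changes
# ... scraping run ...
print("time elapsed: {:.2f}s".format(time.perf_counter() - start_time))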