Optimization:
reduced memory usage; added data collection for volume HTML format errors; added an elapsed-time monitor
This commit is contained in:
parent
2c25682f81
commit
71e613d994
@ -12,8 +12,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
'''
|
||||
爬取网站:'ejde.math.txstate.edu'
|
||||
|
||||
Total number of papers:
|
||||
2023/08/08 - 4300
|
||||
Total number of papers: 2023/08/08 - 4300
|
||||
Total Time via VPN w/119ms-delay: 441.80s
|
||||
|
||||
==========运行顺序==========
|
||||
1、ejde_main 获取各年份的期刊链接 -> 抓取各篇论文的信息和作者信息 -> 调用ejde_save -> 存入小文件(json)暂存
|
||||
@ -81,6 +81,10 @@ def process_volume(url):
|
||||
articles.extend(i for i in i_elements)
|
||||
else:
|
||||
print("HTML FORMAT FAILURE:", url)
|
||||
fail = {
|
||||
"website": url
|
||||
}
|
||||
failedFormatData.append(fail)
|
||||
return
|
||||
break
|
||||
except Exception as fetch_err:
|
||||
@ -91,12 +95,12 @@ def process_volume(url):
|
||||
else:
|
||||
print("HTML FETCHING FAILURE:", url)
|
||||
fail = {
|
||||
"website": url,
|
||||
"website": url
|
||||
}
|
||||
failedVolData.append(fail)
|
||||
return
|
||||
|
||||
# Process each article using multithreading
|
||||
# Process each article using multithreading (more than 20 threads causes more errors)
|
||||
volume_executor = ThreadPoolExecutor(max_workers=15)
|
||||
volume_futures = [volume_executor.submit(process_html_article, baseWeb, article) for article in articles]
|
||||
|
||||
@ -109,7 +113,6 @@ def process_volume(url):
|
||||
|
||||
|
||||
def process_html_article(baseweb, article):
|
||||
global articleNum, authorNum
|
||||
# Get article title & url
|
||||
try:
|
||||
title = article.text.strip()
|
||||
@ -118,7 +121,7 @@ def process_html_article(baseweb, article):
|
||||
except Exception as html_format_err:
|
||||
print("HTML FORMAT FAILURE:", str(html_format_err))
|
||||
fail = {
|
||||
"article": str(article),
|
||||
"article": str(article)
|
||||
}
|
||||
failedFormatData.append(fail)
|
||||
return
|
||||
@ -130,25 +133,15 @@ def process_html_article(baseweb, article):
|
||||
print("ARTICLE PROCESSING FAILURE:", str(article_err))
|
||||
fail = {
|
||||
"title": title,
|
||||
"URL": article_url,
|
||||
"URL": article_url
|
||||
}
|
||||
failedData.append(fail)
|
||||
return
|
||||
|
||||
# Save the data periodically based on batch size
|
||||
if len(articleData) % batch_size == 0:
|
||||
ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
|
||||
articleNum += len(articleData)
|
||||
articleData.clear()
|
||||
|
||||
if len(authorData) % batch_size == 0:
|
||||
ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
|
||||
authorNum += len(authorData)
|
||||
authorData.clear()
|
||||
|
||||
|
||||
@retry(wait_fixed=5000, stop_max_attempt_number=5)
|
||||
def process_article(title, article_url):
|
||||
global articleNum, authorNum
|
||||
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
|
||||
'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
|
||||
article_response = requests.get(article_url, headers=headers)
|
||||
@ -241,7 +234,7 @@ def process_article(title, article_url):
|
||||
"affiliation": [{
|
||||
"year": volume,
|
||||
"affiliation": affiliation,
|
||||
"email": email,
|
||||
"email": email
|
||||
}]
|
||||
}
|
||||
authorData.append(author_data)
|
||||
@ -277,7 +270,7 @@ def process_article(title, article_url):
|
||||
"affiliation": [{
|
||||
"year": volume,
|
||||
"affiliation": affiliation,
|
||||
"email": email,
|
||||
"email": email
|
||||
}]
|
||||
}
|
||||
authorData.append(author_data)
|
||||
@ -301,11 +294,23 @@ def process_article(title, article_url):
|
||||
"journal": "Electronic Journal of Differential Equations",
|
||||
"volume": volume,
|
||||
"issue": issue,
|
||||
"page": pp,
|
||||
"page": pp
|
||||
}
|
||||
articleData.append(article_data)
|
||||
|
||||
# Save the data periodically based on batch size
|
||||
if len(articleData) % batch_size == 0:
|
||||
ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
|
||||
articleNum += len(articleData)
|
||||
articleData.clear()
|
||||
|
||||
if len(authorData) % batch_size == 0:
|
||||
ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
|
||||
authorNum += len(authorData)
|
||||
authorData.clear()
|
||||
|
||||
|
||||
start_time = time.time()
|
||||
index = "https://ejde.math.txstate.edu/indexleft.html"
|
||||
response = requests.get(index)
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
@ -341,7 +346,9 @@ for future in as_completed(futures):
|
||||
print("VOLUME PROCESSING ERROR:", str(vol_err))
|
||||
|
||||
# Retry the papers that failed processing
|
||||
for data in failedData:
|
||||
print("START RETRYING:", len(failedData))
|
||||
while failedData:
|
||||
data = failedData.pop(0)
|
||||
articleTitle = data["title"]
|
||||
articleUrl = data["URL"]
|
||||
try:
|
||||
@ -350,7 +357,7 @@ for data in failedData:
|
||||
print("ARTICLE RETRYING FAILURE:", str(retry_err))
|
||||
totally_fail = {
|
||||
"title": articleTitle,
|
||||
"URL": articleUrl,
|
||||
"URL": articleUrl
|
||||
}
|
||||
totallyFailedData.append(totally_fail)
|
||||
|
||||
@ -378,6 +385,9 @@ if len(failedFormatData) > 0:
|
||||
ejde_save.save_data(failedFormatData, "", "Failed_format_record.json")
|
||||
print("Total failed searching article:", len(failedFormatData))
|
||||
|
||||
# Total running time
|
||||
print("time elapsed: {:.2f}s".format(time.time() - start_time))
|
||||
|
||||
# Transfer to large file and delete the temporary storage files
|
||||
ejde_save.Transf()
|
||||
# ejde_save.delete()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user