Optimization:
Less memory usage; data collection for volume HTML format errors; added an elapsed-time monitor.
This commit is contained in:
parent
2c25682f81
commit
71e613d994
@ -12,8 +12,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
|
|||||||
'''
|
'''
|
||||||
爬取网站:'ejde.math.txstate.edu'
|
爬取网站:'ejde.math.txstate.edu'
|
||||||
|
|
||||||
Total number of papers:
|
Total number of papers: 2023/08/08 - 4300
|
||||||
2023/08/08 - 4300
|
Total Time via VPN w/119ms-delay: 441.80s
|
||||||
|
|
||||||
==========运行顺序==========
|
==========运行顺序==========
|
||||||
1、ejde_main 获取各年份的期刊链接 -> 抓取各篇论文的信息和作者信息 -> 调用ejde_save -> 存入小文件(json)暂存
|
1、ejde_main 获取各年份的期刊链接 -> 抓取各篇论文的信息和作者信息 -> 调用ejde_save -> 存入小文件(json)暂存
|
||||||
@ -81,6 +81,10 @@ def process_volume(url):
|
|||||||
articles.extend(i for i in i_elements)
|
articles.extend(i for i in i_elements)
|
||||||
else:
|
else:
|
||||||
print("HTML FORMAT FAILURE:", url)
|
print("HTML FORMAT FAILURE:", url)
|
||||||
|
fail = {
|
||||||
|
"website": url
|
||||||
|
}
|
||||||
|
failedFormatData.append(fail)
|
||||||
return
|
return
|
||||||
break
|
break
|
||||||
except Exception as fetch_err:
|
except Exception as fetch_err:
|
||||||
@ -91,12 +95,12 @@ def process_volume(url):
|
|||||||
else:
|
else:
|
||||||
print("HTML FETCHING FAILURE:", url)
|
print("HTML FETCHING FAILURE:", url)
|
||||||
fail = {
|
fail = {
|
||||||
"website": url,
|
"website": url
|
||||||
}
|
}
|
||||||
failedVolData.append(fail)
|
failedVolData.append(fail)
|
||||||
return
|
return
|
||||||
|
|
||||||
# Process each article using multithreading
|
# Process each article using multithreading (>20 threads would cause more error)
|
||||||
volume_executor = ThreadPoolExecutor(max_workers=15)
|
volume_executor = ThreadPoolExecutor(max_workers=15)
|
||||||
volume_futures = [volume_executor.submit(process_html_article, baseWeb, article) for article in articles]
|
volume_futures = [volume_executor.submit(process_html_article, baseWeb, article) for article in articles]
|
||||||
|
|
||||||
@ -109,7 +113,6 @@ def process_volume(url):
|
|||||||
|
|
||||||
|
|
||||||
def process_html_article(baseweb, article):
|
def process_html_article(baseweb, article):
|
||||||
global articleNum, authorNum
|
|
||||||
# Get article title & url
|
# Get article title & url
|
||||||
try:
|
try:
|
||||||
title = article.text.strip()
|
title = article.text.strip()
|
||||||
@ -118,7 +121,7 @@ def process_html_article(baseweb, article):
|
|||||||
except Exception as html_format_err:
|
except Exception as html_format_err:
|
||||||
print("HTML FORMAT FAILURE:", str(html_format_err))
|
print("HTML FORMAT FAILURE:", str(html_format_err))
|
||||||
fail = {
|
fail = {
|
||||||
"article": str(article),
|
"article": str(article)
|
||||||
}
|
}
|
||||||
failedFormatData.append(fail)
|
failedFormatData.append(fail)
|
||||||
return
|
return
|
||||||
@ -130,25 +133,15 @@ def process_html_article(baseweb, article):
|
|||||||
print("ARTICLE PROCESSING FAILURE:", str(article_err))
|
print("ARTICLE PROCESSING FAILURE:", str(article_err))
|
||||||
fail = {
|
fail = {
|
||||||
"title": title,
|
"title": title,
|
||||||
"URL": article_url,
|
"URL": article_url
|
||||||
}
|
}
|
||||||
failedData.append(fail)
|
failedData.append(fail)
|
||||||
return
|
return
|
||||||
|
|
||||||
# Save the data periodically based on batch size
|
|
||||||
if len(articleData) % batch_size == 0:
|
|
||||||
ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
|
|
||||||
articleNum += len(articleData)
|
|
||||||
articleData.clear()
|
|
||||||
|
|
||||||
if len(authorData) % batch_size == 0:
|
|
||||||
ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
|
|
||||||
authorNum += len(authorData)
|
|
||||||
authorData.clear()
|
|
||||||
|
|
||||||
|
|
||||||
@retry(wait_fixed=5000, stop_max_attempt_number=5)
|
@retry(wait_fixed=5000, stop_max_attempt_number=5)
|
||||||
def process_article(title, article_url):
|
def process_article(title, article_url):
|
||||||
|
global articleNum, authorNum
|
||||||
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
|
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
|
||||||
'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
|
'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
|
||||||
article_response = requests.get(article_url, headers=headers)
|
article_response = requests.get(article_url, headers=headers)
|
||||||
@ -241,7 +234,7 @@ def process_article(title, article_url):
|
|||||||
"affiliation": [{
|
"affiliation": [{
|
||||||
"year": volume,
|
"year": volume,
|
||||||
"affiliation": affiliation,
|
"affiliation": affiliation,
|
||||||
"email": email,
|
"email": email
|
||||||
}]
|
}]
|
||||||
}
|
}
|
||||||
authorData.append(author_data)
|
authorData.append(author_data)
|
||||||
@ -277,7 +270,7 @@ def process_article(title, article_url):
|
|||||||
"affiliation": [{
|
"affiliation": [{
|
||||||
"year": volume,
|
"year": volume,
|
||||||
"affiliation": affiliation,
|
"affiliation": affiliation,
|
||||||
"email": email,
|
"email": email
|
||||||
}]
|
}]
|
||||||
}
|
}
|
||||||
authorData.append(author_data)
|
authorData.append(author_data)
|
||||||
@ -301,11 +294,23 @@ def process_article(title, article_url):
|
|||||||
"journal": "Electronic Journal of Differential Equations",
|
"journal": "Electronic Journal of Differential Equations",
|
||||||
"volume": volume,
|
"volume": volume,
|
||||||
"issue": issue,
|
"issue": issue,
|
||||||
"page": pp,
|
"page": pp
|
||||||
}
|
}
|
||||||
articleData.append(article_data)
|
articleData.append(article_data)
|
||||||
|
|
||||||
|
# Save the data periodically based on batch size
|
||||||
|
if len(articleData) % batch_size == 0:
|
||||||
|
ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
|
||||||
|
articleNum += len(articleData)
|
||||||
|
articleData.clear()
|
||||||
|
|
||||||
|
if len(authorData) % batch_size == 0:
|
||||||
|
ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
|
||||||
|
authorNum += len(authorData)
|
||||||
|
authorData.clear()
|
||||||
|
|
||||||
|
|
||||||
|
start_time = time.time()
|
||||||
index = "https://ejde.math.txstate.edu/indexleft.html"
|
index = "https://ejde.math.txstate.edu/indexleft.html"
|
||||||
response = requests.get(index)
|
response = requests.get(index)
|
||||||
soup = BeautifulSoup(response.content, 'html.parser')
|
soup = BeautifulSoup(response.content, 'html.parser')
|
||||||
@ -341,7 +346,9 @@ for future in as_completed(futures):
|
|||||||
print("VOLUME PROCESSING ERROR:", str(vol_err))
|
print("VOLUME PROCESSING ERROR:", str(vol_err))
|
||||||
|
|
||||||
# Retry failed processing paper
|
# Retry failed processing paper
|
||||||
for data in failedData:
|
print("START RETRYING:", len(failedData))
|
||||||
|
while failedData:
|
||||||
|
data = failedData.pop(0)
|
||||||
articleTitle = data["title"]
|
articleTitle = data["title"]
|
||||||
articleUrl = data["URL"]
|
articleUrl = data["URL"]
|
||||||
try:
|
try:
|
||||||
@ -350,7 +357,7 @@ for data in failedData:
|
|||||||
print("ARTICLE RETRYING FAILURE:", str(retry_err))
|
print("ARTICLE RETRYING FAILURE:", str(retry_err))
|
||||||
totally_fail = {
|
totally_fail = {
|
||||||
"title": articleTitle,
|
"title": articleTitle,
|
||||||
"URL": articleUrl,
|
"URL": articleUrl
|
||||||
}
|
}
|
||||||
totallyFailedData.append(totally_fail)
|
totallyFailedData.append(totally_fail)
|
||||||
|
|
||||||
@ -378,6 +385,9 @@ if len(failedFormatData) > 0:
|
|||||||
ejde_save.save_data(failedFormatData, "", "Failed_format_record.json")
|
ejde_save.save_data(failedFormatData, "", "Failed_format_record.json")
|
||||||
print("Total failed searching article:", len(failedFormatData))
|
print("Total failed searching article:", len(failedFormatData))
|
||||||
|
|
||||||
|
# Total running time
|
||||||
|
print("time elapsed: {:.2f}s".format(time.time() - start_time))
|
||||||
|
|
||||||
# Transfer to large file and delete the temporary storage files
|
# Transfer to large file and delete the temporary storage files
|
||||||
ejde_save.Transf()
|
ejde_save.Transf()
|
||||||
# ejde_save.delete()
|
# ejde_save.delete()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user