1. unworkable retrying function back online baby
New Function:
1. reformatted the datetime_transform function to handle more month typos
2. split the process_article function into 3 functions so that multi-threading gives a better running time
3. renewed the article URL search technique to handle volume pages with different layouts
4. more exception handling
5. improved the keyword and affiliation stripping method
6. added methods for processing author data when no author table exists
7. added code to retry papers that failed processing
8. more detailed error message storage
This commit is contained in:
ldy 2023-08-10 01:15:17 +08:00
parent a9c753567c
commit 2c25682f81

View File

@ -1,8 +1,10 @@
import time
import uuid import uuid
import requests import requests
import re import re
import ejde_save import ejde_save
from retrying import retry
from datetime import datetime from datetime import datetime
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
@ -10,6 +12,9 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
''' '''
爬取网站'ejde.math.txstate.edu' 爬取网站'ejde.math.txstate.edu'
Total number of papers:
2023/08/08 - 4300
==========运行顺序========== ==========运行顺序==========
1ejde_main 获取各年份的期刊链接 -> 抓取各篇论文的信息和作者信息 -> 调用ejde_save -> 存入小文件json暂存 1ejde_main 获取各年份的期刊链接 -> 抓取各篇论文的信息和作者信息 -> 调用ejde_save -> 存入小文件json暂存
2ejde_save 从本地浏览暂存的小文件筛选后存入不同年份的大文件 2ejde_save 从本地浏览暂存的小文件筛选后存入不同年份的大文件
@ -18,42 +23,141 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
def datetime_transform(date):
    """Convert a date such as "March 7, 2012" to ISO form "2012-03-07".

    Known misspellings and abbreviations of month names are corrected
    before a second parsing attempt.

    Args:
        date: Date string expected to look like "<Month> <day>, <year>".

    Returns:
        The date formatted as "%Y-%m-%d" when it can be parsed. Otherwise the
        input string with known month typos corrected (a message is printed).
    """
    month_typo = {
        "Janaury": "January",
        "Febrary": "February",
        "Februay": "February",
        "Mar": "March",
        "Mach": "March",
        "Match": "March",
        "Maay": "May",
        "Jun": "June",
        "Juy": "July",
        "Aapril": "April",
        "Spetember": "September",
        "Septembere": "September",
        "Ocotber": "October",
    }
    date_format = "%B %d, %Y"

    try:
        return datetime.strptime(date, date_format).strftime("%Y-%m-%d")
    except ValueError:
        pass

    def fix_word(word_match):
        word = word_match.group(0)
        # strptime ignores case, so look typos up case-insensitively as well.
        return month_typo.get(word.capitalize(), word)

    # Replace whole words only. A plain substring replace would turn a valid
    # "March" into "Marchch" (via the "Mar" entry) whenever the first parse
    # failed for a different reason, e.g. an out-of-range day.
    corrected = re.sub(r"[A-Za-z]+", fix_word, date)

    try:
        return datetime.strptime(corrected, date_format).strftime("%Y-%m-%d")
    except ValueError as val_err:
        print("TYPO:", str(val_err))
        return corrected
# Article and author detail # Article and author detail
def process_volume(url):
    """Fetch one volume index page and process every article listed on it.

    The page is requested up to five times. When every attempt fails, the URL
    is recorded in ``failedVolData`` and nothing else is done. Article title
    elements are collected from each ``<ol>`` of the page and handed to
    ``process_html_article`` on a thread pool.

    Args:
        url: Absolute URL of the volume index page.

    Returns:
        None. Results are stored through the module-level lists.
    """
    # Article links on the volume page are relative to the page's directory.
    baseWeb = url[:url.rfind('/')] + "/"

    retries = 5
    volume_response = None
    for attempt in range(retries):
        try:
            volume_response = requests.get(url)
            # Must be unconditional: guarding it with `status_code == 200`
            # means it can never raise, so error statuses were neither
            # retried nor recorded.
            volume_response.raise_for_status()
            break
        except Exception as fetch_err:
            if attempt < retries - 1:
                print("RETRYING TO FETCH HTML:", str(fetch_err))
                time.sleep(1)
            else:
                print("HTML FETCHING FAILURE:", url)
                failedVolData.append({"website": url})
                return

    # Parse outside the retry loop so a retry can never add articles twice.
    volume_soup = BeautifulSoup(volume_response.text, "html.parser")
    articles = []
    for ol in volume_soup.find_all('ol'):
        # Titles are marked up with <em>; another HTML style uses <i>.
        title_elements = ol.find_all('em') or ol.find_all('i')
        if not title_elements:
            print("HTML FORMAT FAILURE:", url)
            return
        articles.extend(title_elements)

    # Process each article using multithreading; the context manager makes
    # sure the worker threads are released once all tasks are done.
    with ThreadPoolExecutor(max_workers=15) as volume_executor:
        volume_futures = [
            volume_executor.submit(process_html_article, baseWeb, article)
            for article in articles
        ]

        # Wait for all tasks to complete
        for volume_future in as_completed(volume_futures):
            try:
                volume_future.result()
            except Exception as html_err:
                print("HTML PROCESSING ERROR:", str(html_err))
article_text = soup.get_text()
def process_html_article(baseweb, article):
    """Resolve one article entry of a volume page and crawl its detail page.

    Failures are recorded instead of raised: entries whose title or link
    cannot be extracted go to ``failedFormatData``, and articles whose detail
    page cannot be processed go to ``failedData``.

    Args:
        baseweb: Base URL (ending in "/") that article links are relative to.
        article: Parsed HTML element holding the article title; the article
            link is taken from the next ``<a>`` element after it.

    Returns:
        None. Data is accumulated in ``articleData`` / ``authorData`` and
        flushed through ``ejde_save.save_data`` once a batch is full.
    """
    global articleNum, authorNum

    # Get article title & url
    try:
        title = re.sub(r'\s+', ' ', article.text.strip()).strip()
        article_url = baseweb + article.find_next("a")["href"]
    except Exception as html_format_err:
        print("HTML FORMAT FAILURE:", str(html_format_err))
        failedFormatData.append({"article": str(article)})
        return

    # Crawl article data
    try:
        process_article(title, article_url)
    except Exception as article_err:
        print("ARTICLE PROCESSING FAILURE:", str(article_err))
        failedData.append({"title": title, "URL": article_url})
        return

    # Save the data periodically based on batch size.
    # `>=` rather than `% batch_size == 0`: the modulo test is also true for
    # an empty list, and it is missed whenever the length jumps past an exact
    # multiple (several authors per article, or appends from other threads).
    # NOTE(review): save + clear is not atomic across worker threads; confirm
    # whether a lock is needed around these two blocks.
    if len(articleData) >= batch_size:
        ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
        articleNum += len(articleData)
        articleData.clear()

    if len(authorData) >= batch_size:
        ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
        authorNum += len(authorData)
        authorData.clear()
@retry(wait_fixed=5000, stop_max_attempt_number=5)
def process_article(title, article_url):
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
article_response = requests.get(article_url, headers=headers)
if article_response.status_code == 200:
article_response.raise_for_status()
html = article_response.text
article_soup = BeautifulSoup(html, 'html.parser')
article_text = article_soup.get_text()
# Extract volume # Extract volume
volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text) volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
@ -69,26 +173,22 @@ def process_article(url):
# Extract submission date # Extract submission date
match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html) match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
submitted_date = match.group(1) submitted_date = match.group(1) if match else None
if match: if submitted_date:
submitted_date = datetime_transform(submitted_date) submitted_date = datetime_transform(submitted_date)
else:
submitted_date = None
# Extract publication date # Extract publication date
match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html) match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
publish_date = match.group(1) publish_date = match.group(1) if match else None
if match: if publish_date:
publish_date = datetime_transform(publish_date) publish_date = datetime_transform(publish_date)
else:
publish_date = None
# Extract MSC # Extract MSC
msc_match = re.search(r'Math Subject Classifications: (.*?)\n', html) msc_match = re.search(r'Math Subject Classifications: (.*?)\n', html)
if not msc_match: if not msc_match:
msc_match = re.search(r'Math Subject Classifications:(.*?)\.', html) msc_match = re.search(r'Math Subject Classifications:(.*?)\.', html)
if msc_match: if msc_match:
msc = msc_match.group(1).strip().strip('.') msc = msc_match.group(1).strip().strip('.').strip()
msc = re.split(r', |;', msc) msc = re.split(r', |;', msc)
else: else:
msc = None msc = None
@ -100,7 +200,7 @@ def process_article(url):
if keywords_match: if keywords_match:
keywords = keywords_match.group(1).strip().replace('\n', '') keywords = keywords_match.group(1).strip().replace('\n', '')
keywords = re.split(r', |;', keywords) keywords = re.split(r', |;', keywords)
keywords = [keyword.strip().strip('.') for keyword in keywords] keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords]
else: else:
keywords = None keywords = None
@ -113,6 +213,79 @@ def process_article(url):
# Article_id # Article_id
article_id = str(uuid.uuid4()) article_id = str(uuid.uuid4())
# Author info
authors = []
table = article_soup.find('table')
if table:
for row in table.find_all('tr'):
cells = [cell.text.strip() for cell in row.find_all('td')]
for cell in cells:
cell = cell.split("\n")
cell = [element.replace('email: ', '') for element in cell]
cell = [c.strip() for c in cell]
# Data processing
authors.append(cell[0])
name = cell[0].split(" ")
affiliation = ', '.join(cell[1:-1])
affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1])
email = email_match.group() if email_match else None
author_data = {
"author_id": str(uuid.uuid4()),
"from_article": article_id,
"firstname": name[0],
"lastname": name[-1],
"middlename": name[1:len(name) - 1] if len(name) > 2 else None,
"affiliation": [{
"year": volume,
"affiliation": affiliation,
"email": email,
}]
}
authorData.append(author_data)
# If no author table
else:
pattern = r'<hr>(.*?)<hr>'
matches = str(re.findall(pattern, html, re.DOTALL))
if matches:
matches = matches.replace('["', '').replace('"]', '').replace('[\'', '').replace('\']', '')
matches = matches.split("<p>")
for match in matches:
match = re.sub(r'<[^>]+>', '', match)
match = match.lstrip("\\n ").rstrip("\\n ").strip()
match = match.split("\\n")
match = [element.replace('email: ', '') for element in match]
match = [m.strip() for m in match]
# Data processing
authors.append(match[0])
name = match[0].split(" ")
affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip()
affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1])
email = email_match.group() if email_match else None
author_data = {
"author_id": str(uuid.uuid4()),
"from_article": article_id,
"firstname": name[0],
"lastname": name[-1],
"middlename": name[1:len(name) - 1] if len(name) > 2 else None,
"affiliation": [{
"year": volume,
"affiliation": affiliation,
"email": email,
}]
}
authorData.append(author_data)
else:
print("AUTHOR SEARCHING ERROR:", article_url)
return
# Article info
article_data = { article_data = {
"article_id": article_id, "article_id": article_id,
"title": title, "title": title,
@ -132,79 +305,79 @@ def process_article(url):
} }
articleData.append(article_data) articleData.append(article_data)
# Author info
table = soup.find('table')
for row in table.find_all('tr'):
cells = [cell.text.strip() for cell in row.find_all('td')]
for cell in cells:
cell = cell.split("\n")
cell = [element.replace('email: ', '') for element in cell]
cell = [c.strip() for c in cell]
# Data processing
name = cell[0].split(" ")
affiliation = ', '.join(cell[1:-1])
email = cell[-1]
author_data = {
"author_id": str(uuid.uuid4()),
"from_article": article_id,
"firstname": name[0],
"lastname": name[-1],
"middlename": name[1:len(name) - 1] if len(name) > 2 else None,
"affiliation": [{
"year": volume,
"affiliation": affiliation,
"email": email,
}]
}
authorData.append(author_data)
# Save the data periodically based on batch size
if len(articleData) % batch_size == 0:
ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
articleData.clear()
if len(authorData) % batch_size == 0:
ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
authorData.clear()
# Driver script: collect the volume URLs, crawl them concurrently, retry the
# failed articles once more, then persist the remaining data and error records.
index = "https://ejde.math.txstate.edu/indexleft.html"
response = requests.get(index)
soup = BeautifulSoup(response.content, 'html.parser')

# Find all the URL links under the first Volume section
volume_links = soup.select('font > a[href]')

# Extract and store the URLs in a list; the first link is skipped and the
# order is reversed.
url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1]

# Initialize lists
authorData = []
articleData = []
failedData = []
totallyFailedData = []
failedVolData = []
failedFormatData = []

# Initialize variables for counting
authorNum = 0
articleNum = 0

batch_size = 100  # Number of articles to process before saving

# Process each URL using multithreading; the context manager shuts the pool
# down once every volume has been handled.
with ThreadPoolExecutor(max_workers=25) as executor:
    futures = [executor.submit(process_volume, url) for url in url_list]

    # Wait for all tasks to complete
    for future in as_completed(futures):
        try:
            future.result()
        except Exception as vol_err:
            print("VOLUME PROCESSING ERROR:", str(vol_err))

# Retry failed processing paper
for data in failedData:
    articleTitle = data["title"]
    articleUrl = data["URL"]
    try:
        process_article(articleTitle, articleUrl)
    except Exception as retry_err:
        print("ARTICLE RETRYING FAILURE:", str(retry_err))
        totallyFailedData.append({
            "title": articleTitle,
            "URL": articleUrl,
        })

# Save remaining data
remaining_articles = len(articleData)
if remaining_articles > 0:
    ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
    print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
# Report the total even when the last batch was flushed exactly.
print("Total fetched paper:", remaining_articles + articleNum)

remaining_authors = len(authorData)
if remaining_authors > 0:
    ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
    print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
print("Total fetched author:", remaining_authors + authorNum)

# Save error record
if len(totallyFailedData) > 0:
    # Store the articles that still failed after the retry pass. The list
    # written here must match the one that is checked and counted.
    ejde_save.save_data(totallyFailedData, "", "Failed_article_record.json")
    print("Total failed processing paper:", len(totallyFailedData))

if len(failedVolData) > 0:
    ejde_save.save_data(failedVolData, "", "Failed_volume_record.json")
    print("Total failed fetching volume:", len(failedVolData))

if len(failedFormatData) > 0:
    ejde_save.save_data(failedFormatData, "", "Failed_format_record.json")
    print("Total failed searching article:", len(failedFormatData))

# Transfer to large file; deleting the temporary storage files is disabled.
ejde_save.Transf()
# ejde_save.delete()