Optimization:
1. Add special issue papers to the crawl
2. Optimize the DOI regular expression
Bug fix:
1. Handle authors with multiple email addresses
Reminder:
1. The author's email data structure changed as part of the multiple-email fix: the affiliation entry's "email" field is now a list of addresses rather than a single string
ldy 2023-08-19 21:15:12 +08:00
parent 88bcbf5b8f
commit ba3671b5fd

@@ -12,8 +12,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 '''
 Scrape the website 'ejde.math.txstate.edu'
-Total number of papers: 2023/08/08 - 4300
-Total Time via VPN w/100ms-delay: 254.04s
+Total number of papers: 2023/08/08 - 4339
+Total Time via VPN w/52ms-delay: 430.38s
 ==========Execution order==========
 1. ejde_main: fetch the journal links for each year -> scrape each paper's and each author's information -> call ejde_save -> buffer the results in small JSON files
@@ -68,15 +68,17 @@ def process_volume(url):
     baseWeb = url[:url.rfind('/')] + "/"
     html = volume_response.text
     volume_soup = BeautifulSoup(html, "html.parser")
-    ol_elements = volume_soup.find_all('ol')
-    for ol in ol_elements:
-        em_elements = ol.find_all('em')
+    li_elements = volume_soup.find_all('ol')
+    if not li_elements:
+        li_elements = volume_soup.find_all('ul')
+    for li in li_elements:
+        em_elements = li.find_all('em')
         if em_elements:
             articles.extend(em for em in em_elements)
         # Another html style
         else:
-            i_elements = ol.find_all('i')
+            i_elements = li.find_all('i')
             if i_elements:
                 articles.extend(i for i in i_elements)
             else:
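
The new fallback covers listing pages that wrap articles in a <ul> instead of an <ol> (presumably the special-issue tables of contents). A minimal self-contained sketch of the same pattern; the HTML snippet is illustrative, not taken from the site:

from bs4 import BeautifulSoup

# Illustrative markup only; special-issue pages are assumed to use <ul>.
html = "<ul><li><em>Paper title one</em></li><li><em>Paper title two</em></li></ul>"
soup = BeautifulSoup(html, "html.parser")

lists = soup.find_all("ol") or soup.find_all("ul")  # same ol-then-ul fallback, via `or`
articles = [em for lst in lists for em in lst.find_all("em")]
print([a.text for a in articles])  # ['Paper title one', 'Paper title two']

The `or` form behaves the same as the explicit `if not li_elements:` check, since find_all returns an empty (falsy) ResultSet when nothing matches.
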
@@ -157,21 +159,26 @@ def process_article(title, article_url):
     title_match = re.search(r"<h3>(.*?)<p>", article_text)
     title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else None
+    # Extract issue
+    issue_match = re.search(r'No\. (\d+)', article_text)
+    issue = issue_match.group(1) if issue_match else None
     # Extract volume
-    volume_match = re.search(r'Vol\. (\d+)', article_text)
-    volume = str(volume_match.group(1)) if volume_match else None
+    volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
+    volume = str(volume_match.group(1)) if volume_match else None
+    if not volume:
+        volume_match = re.search(r'Vol\. (\d+)', article_text)
+        volume = str(volume_match.group(1)) if volume_match else None
+    if not volume:
+        volume_match = re.search(r'Special Issue (\d+) \((\d+)\)', article_text)
+        if volume_match:
+            issue_number, volume = volume_match.groups()
+            volume = str(volume)
+            issue = "Special Issue " + str(issue_number)
+        else:
+            volume = None
     # Extract pp
     pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
     pp = pp_match.group(1) if pp_match else None
-    # Extract issue
-    issue_match = re.search(r'No\. (\d+)', article_text)
-    issue = issue_match.group(1) if issue_match else None
     # Extract submission date
     match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
     submitted_date = match.group(1) if match else None
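
The new cascade tries 'Vol. N (YYYY)' first, then bare 'Vol. N', then the special-issue heading, which is what lets special-issue papers through. A quick check with made-up header strings, not scraped output (the bare 'Vol. N' middle step is elided for brevity):

import re

for article_text in ("Vol. 2023 (2023), No. 41, pp. 1-18",
                     "Special Issue 01 (2021), pp. 1-9"):
    volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
    volume = str(volume_match.group(1)) if volume_match else None
    issue = None
    if not volume:
        volume_match = re.search(r'Special Issue (\d+) \((\d+)\)', article_text)
        if volume_match:
            issue_number, volume = volume_match.groups()
            issue = "Special Issue " + str(issue_number)
    print(volume, issue)
# -> 2023 None   (the plain issue number is already caught by the r'No\. (\d+)' search)
# -> 2021 Special Issue 01
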
@@ -207,7 +214,7 @@ def process_article(title, article_url):
         keywords = []
     # Extract DOI
-    doi_match = re.search(r'DOI: (.+)(?=<)', html)
+    doi_match = re.search(r'DOI: ([^\t\n<]+)', html)
     if not doi_match:
         doi_match = re.search(r'DOI: (.+)', html)
     doi = doi_match.group(1) if doi_match else None
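
The old pattern's greedy `(.+)` ran to the last `<` on the line, so trailing markup could end up inside the captured DOI; the negated character class stops at the first tag, tab, or newline. An illustrative comparison (the DOI string and markup are invented for the demo):

import re

html = 'DOI: 10.58997/ejde.2023.41\t<br><a href="#top">top</a>'  # invented line
old = re.search(r'DOI: (.+)(?=<)', html)
new = re.search(r'DOI: ([^\t\n<]+)', html)
print(repr(old.group(1)))  # '10.58997/ejde.2023.41\t<br><a href="#top">top' -- greedy over-capture
print(repr(new.group(1)))  # '10.58997/ejde.2023.41' -- stops before the tab/tag
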
@@ -222,18 +229,33 @@ def process_article(title, article_url):
         for row in table.find_all('tr'):
             cells = [cell.text.strip() for cell in row.find_all('td')]
             for cell in cells:
+                if "email:" in cell:
+                    cell = cell.split("email:")
+                    email_list = str(cell[1]).split(',')
+                    cell = cell[0]
+                elif "e-mail:" in cell:
+                    cell = cell.split("e-mail:")
+                    email_list = str(cell[1]).split(',')
+                    cell = cell[0]
+                else:
+                    email_list = None
                 cell = re.split(r'[\r\n]+', cell)
-                cell = [c.replace('email: ', '').replace('\\newline', '') for c in cell]
+                cell = [c.replace('\\newline', '') for c in cell]
                 cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]
                 # Data processing
                 if cell[0]:
                     authors.append(cell[0])
                     name = re.split(r'[ .]', cell[0])
-                    affiliation = ', '.join(cell[1:-1])
-                    affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
-                    email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1])
-                    email = email_match.group() if email_match else None
+                    affiliation = ', '.join(cell[1:]).lstrip(",").rstrip(",").strip()
+                    affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
+                    affiliation = affiliation.lstrip(",").rstrip(",").strip()
+                    emails = []
+                    if email_list:
+                        for email in email_list:
+                            email_match = re.search(r'[\w.-]+@[\w.-]+', email)
+                            emails.append(email_match.group()) if email_match else None
 
                     author_data = {
                         "author_id": str(uuid.uuid4()),
@@ -244,7 +266,7 @@ def process_article(title, article_url):
                         "affiliation": [{
                             "year": volume,
                             "affiliation": affiliation,
-                            "email": email
+                            "email": emails
                         }]
                     }
                     authorData.append(author_data)
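
Both extraction paths now split the cell at the first "email:"/"e-mail:" marker and harvest every comma-separated address, where the old code kept only a single match from the last line. The same logic as a standalone helper, for illustration only; the function name and sample string are ours, not the repo's:

import re

def extract_emails(raw):
    # Split off the text after the first "email:"/"e-mail:" marker,
    # then pull one address out of each comma-separated chunk.
    for marker in ("e-mail:", "email:"):
        if marker in raw:
            head, tail = raw.split(marker, 1)
            return head, [m.group() for part in tail.split(",")
                          if (m := re.search(r"[\w.-]+@[\w.-]+", part))]
    return raw, []

head, emails = extract_emails(
    "Department of Mathematics, Example University email: alice@example.edu, bob@example.edu")
print(emails)  # ['alice@example.edu', 'bob@example.edu']

A plain `if email_match:` block would also be more idiomatic than the diff's `emails.append(...) if email_match else None`, which uses a conditional expression purely for its side effect.
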
@@ -267,23 +289,38 @@ def process_article(title, article_url):
         matches = matches.replace('["', '').replace('"]', '').replace('[\'', '').replace('\']', '')
         matches = matches.split("<p>")
         for match in matches:
+            if "email:" in match:
+                match = match.split("email:")
+                email_list = str(match[1]).split(',')
+                match = match[0]
+            elif "e-mail:" in match:
+                match = match.split("e-mail:")
+                email_list = str(match[1]).split(',')
+                match = match[0]
+            else:
+                email_list = None
             match = re.sub(r'<[^>]+>', '', match)
             match = match.lstrip("\\n ").lstrip("\n ").rstrip("\\n ").rstrip("\n ").strip()
             if match_type == 0:
                 match = match.split("\\n")
             else:
                 match = match.split("\n")
-            match = [m.replace('email: ', '').replace('\\newline', '') for m in match]
+            match = [m.replace('\\newline', '') for m in match]
             match = [re.sub(r'\s+', ' ', m).strip() for m in match]
             # Data processing
             if match[0]:
                 authors.append(match[0])
                 name = re.split(r'[ .]', match[0])
-                affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip()
-                affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
-                email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1])
-                email = email_match.group() if email_match else None
+                affiliation = ''.join(match[1:]).lstrip(",").rstrip(",").strip()
+                affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
+                affiliation = affiliation.lstrip(",").rstrip(",").strip()
+                emails = []
+                if email_list:
+                    for email in email_list:
+                        email_match = re.search(r'[\w.-]+@[\w.-]+', email)
+                        emails.append(email_match.group()) if email_match else None
 
                 author_data = {
                     "author_id": str(uuid.uuid4()),
@@ -294,7 +331,7 @@ def process_article(title, article_url):
                     "affiliation": [{
                         "year": volume,
                         "affiliation": affiliation,
-                        "email": email
+                        "email": emails
                     }]
                 }
                 authorData.append(author_data)
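
This is the change the commit message's Reminder points at: in both paths, the affiliation entry's "email" field now holds a list, so anything consuming the Author_TS JSON must expect an array. Before and after, with invented values:

# Before the fix: a single string; extra addresses were silently dropped
{"year": "2023", "affiliation": "Example University", "email": "alice@example.edu"}
# After the fix: a list; every address after the marker is kept
{"year": "2023", "affiliation": "Example University", "email": ["alice@example.edu", "bob@example.edu"]}
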
@@ -339,14 +376,21 @@ def process_article(title, article_url):
 start_time = time.time()
+url_list = []
+# Get all general volumes url
 index = "https://ejde.math.txstate.edu/indexleft.html"
 response = requests.get(index)
 soup = BeautifulSoup(response.content, 'html.parser')
-# Find all the URL links under the first Volume section
 volume_links = soup.select('font > a[href]')
-# Extract and store the URLs in a list using list comprehension
-url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1]
+url_list.extend(["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1])
+
+# Get all special issues url
+index = "https://ejde.math.txstate.edu/special-toc.html"
+response = requests.get(index)
+soup = BeautifulSoup(response.content, 'html.parser')
+special_links = soup.find_all("a", href=True)
+url_list.extend(["https://ejde.math.txstate.edu/" + tag["href"] for tag in special_links[:-1]])
 
 # Initialize lists
 authorData = []
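
The general-volume and special-issue blocks differ only in the index URL and link selector; if a third index ever appears, they could collapse into one helper. A sketch only, with the helper name, timeout, and base-URL default chosen by us (note that `special_links[:-1]` in the commit drops the last link on the special-issue TOC, presumably a navigation link):

import requests
from bs4 import BeautifulSoup

def collect_links(index_url, selector=None, base="https://ejde.math.txstate.edu/"):
    # Fetch an index page and return an absolute URL for every matching link.
    resp = requests.get(index_url, timeout=30)
    soup = BeautifulSoup(resp.content, "html.parser")
    links = soup.select(selector) if selector else soup.find_all("a", href=True)
    return [base + a["href"] for a in links]

# url_list = collect_links("https://ejde.math.txstate.edu/indexleft.html", "font > a[href]")[1:][::-1]
# url_list += collect_links("https://ejde.math.txstate.edu/special-toc.html")[:-1]
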
@@ -361,7 +405,7 @@ authorNum = 0
 articleNum = 0
 batch_size = 100  # Number of articles to process before saving
-executor = ThreadPoolExecutor(max_workers=25)  # Set the number of worker threads
+executor = ThreadPoolExecutor(max_workers=len(url_list))  # Set the number of worker threads
 
 # Process each URL using multithreading
 futures = [executor.submit(process_volume, url) for url in url_list]
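
Sizing the pool with `max_workers=len(url_list)` gives every volume (and now every special issue) its own thread, so the thread count grows with the site rather than staying pinned at 25. If that ever becomes too aggressive for the server or the VPN, a capped variant is a one-line change; the cap of 32 below is an arbitrary choice of ours:

from concurrent.futures import ThreadPoolExecutor

executor = ThreadPoolExecutor(max_workers=min(32, max(1, len(url_list))))
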
@@ -393,12 +437,12 @@ while failedData:
 if len(articleData) > 0:
     ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
     print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
     print("Total fetched paper:", len(articleData) + articleNum)
 
 if len(authorData) > 0:
     ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
     print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
     print("Total fetched author:", len(authorData) + authorNum)
 
 # Save error record
 if len(totallyFailedData) > 0: