Change the data structure

XCX 2023-08-13 21:36:22 +08:00
commit 1602d03e9d

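The data-structure change shown in the diff below: process_article now stores the bare article_id string in each author record's "from_article" field (previously a one-element list), computes "middle_name" inline from the split name, and appends unparseable pages to failedFormatData instead of returning early. As a minimal sketch of the resulting record — field names come from the diff, all sample values are hypothetical:

# Sketch of the reshaped author record; sample values are hypothetical, not scraped data.
import uuid

article_id = str(uuid.uuid4())      # id of the paper the author was found on
name = ["Jane", "Q", "Doe"]         # author name already split into parts

author_data = {
    "author_id": str(uuid.uuid4()),
    "from_article": article_id,     # was: [article_id]
    "first_name": name[0],
    "last_name": name[-1],
    "middle_name": ''.join(name[1:-1]) if name[1:-1] else None,
    "affiliation": [{
        "year": "2023",                                    # the volume string in the scraper
        "affiliation": "Example University, Example City", # hypothetical
        "email": "jane.doe@example.edu"                    # hypothetical
    }]
}
print(author_data["from_article"] == article_id)  # True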

@@ -7,13 +7,13 @@ import ejde_save
 from retrying import retry
 from datetime import datetime
 from bs4 import BeautifulSoup
-from concurrent.futures import ThreadPoolExecutor, as_completed, wait
+from concurrent.futures import ThreadPoolExecutor, as_completed
 '''
 Crawl the site 'ejde.math.txstate.edu'
 Total number of papers: 2023/08/08 - 4300
-Total Time via VPN w/119ms-delay: 441.80s
+Total Time via VPN w/100ms-delay: 254.04s
 ========== Execution order ==========
 1. ejde_main: fetch the journal links for each year -> scrape each paper's info and author info -> call ejde_save -> save to small temporary JSON files
@@ -152,9 +152,17 @@ def process_article(title, article_url):
     article_soup = BeautifulSoup(html, 'html.parser')
     article_text = article_soup.get_text()
+    # Extract title if title == None
+    if not title:
+        title_match = re.search(r"<h3>(.*?)<p>", article_text)
+        title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else None
     # Extract volume
     volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
     volume = str(volume_match.group(1)) if volume_match else None
+    if not volume:
+        volume_match = re.search(r'Vol\. (\d+)', article_text)
+        volume = str(volume_match.group(1)) if volume_match else None
     # Extract pp
     pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
@@ -181,17 +189,18 @@ def process_article(title, article_url):
     if not msc_match:
         msc_match = re.search(r'Math Subject Classifications:(.*?)\.', html)
     if msc_match:
-        msc = msc_match.group(1).strip().strip('.').strip()
+        msc = re.sub(r'<[^>]+>', '', msc_match.group(1).strip())
+        msc = msc.strip('.').strip()
         msc = re.split(r', |;', msc)
     else:
         msc = []
     # Extract KeyWords
-    keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
+    keywords_match = re.search(r'Key Words: (.*?)(?:<br>|<p>|$)', html, re.DOTALL)
     if not keywords_match:
-        keywords_match = re.search(r'Key Words: (.*?)<br>', html, re.DOTALL)
+        keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
     if keywords_match:
-        keywords = keywords_match.group(1).strip().replace('\n', '')
+        keywords = re.sub(r'<[^>]+>', '', keywords_match.group(1).strip()).replace('\n', '')
         keywords = re.split(r', |;', keywords)
         keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords]
     else:
@@ -213,72 +222,89 @@ def process_article(title, article_url):
         for row in table.find_all('tr'):
             cells = [cell.text.strip() for cell in row.find_all('td')]
             for cell in cells:
-                cell = cell.split("\n")
-                cell = [element.replace('email: ', '') for element in cell]
-                cell = [c.strip() for c in cell]
+                cell = re.split(r'[\r\n]+', cell)
+                cell = [c.replace('email: ', '').replace('\\newline', '') for c in cell]
+                cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]
                 # Data processing
-                authors.append(cell[0])
-                name = cell[0].split(" ")
-                middle_name = ''.join(name[1:-1]) if name[1:-1] else None
+                if cell[0]:
+                    authors.append(cell[0])
+                    name = re.split(r'[ .]', cell[0])
                     affiliation = ', '.join(cell[1:-1])
                     affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
                     email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1])
                     email = email_match.group() if email_match else None
                     author_data = {
                         "author_id": str(uuid.uuid4()),
-                        "from_article": [article_id],
+                        "from_article": article_id,
                         "first_name": name[0],
                         "last_name": name[-1],
-                        "middle_name": middle_name,
+                        "middle_name": ''.join(name[1:-1]) if name[1:-1] else None,
                         "affiliation": [{
                             "year": volume,
                             "affiliation": affiliation,
                             "email": email
                         }]
                     }
                     authorData.append(author_data)
     # If no author table
     else:
-        pattern = r'<hr>(.*?)<hr>'
+        match_type = 0
+        hr_count = len(article_soup.find_all('hr'))
+        if hr_count < 3:
+            pattern = r'<hr>(.*?)<hr>'
+        else:
+            pattern = r'<hr>(?:.*<hr>)(.*)(?=<hr>)'
         matches = str(re.findall(pattern, html, re.DOTALL))
+        if len(matches) < 5:
+            match_type = 1
+            last_p_tag = str(article_soup.find_all('p')[-1])
+            pattern = r'<p>(.*?)<hr/>'
+            matches = re.search(pattern, str(last_p_tag), re.DOTALL).group(1).strip()
         if matches:
             matches = matches.replace('["', '').replace('"]', '').replace('[\'', '').replace('\']', '')
             matches = matches.split("<p>")
             for match in matches:
                 match = re.sub(r'<[^>]+>', '', match)
-                match = match.lstrip("\\n ").rstrip("\\n ").strip()
-                match = match.split("\\n")
-                match = [element.replace('email: ', '') for element in match]
-                match = [m.strip() for m in match]
+                match = match.lstrip("\\n ").lstrip("\n ").rstrip("\\n ").rstrip("\n ").strip()
+                if match_type == 0:
+                    match = match.split("\\n")
+                else:
+                    match = match.split("\n")
+                match = [m.replace('email: ', '').replace('\\newline', '') for m in match]
+                match = [re.sub(r'\s+', ' ', m).strip() for m in match]
                 # Data processing
-                authors.append(match[0])
-                name = match[0].split(" ")
-                middle_name = ''.join(name[1:-1]) if name[1:-1] else None
+                if match[0]:
+                    authors.append(match[0])
+                    name = re.split(r'[ .]', match[0])
                     affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip()
                     affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
                     email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1])
                     email = email_match.group() if email_match else None
                     author_data = {
                         "author_id": str(uuid.uuid4()),
-                        "from_article": [article_id],
+                        "from_article": article_id,
                         "first_name": name[0],
                         "last_name": name[-1],
-                        "middle_name": middle_name,
+                        "middle_name": ''.join(name[1:-1]) if name[1:-1] else None,
                         "affiliation": [{
                             "year": volume,
                             "affiliation": affiliation,
                             "email": email
                         }]
                     }
                     authorData.append(author_data)
         else:
             print("AUTHOR SEARCHING ERROR:", article_url)
-            return
+            fail = {
+                "title": title,
+                "URL": article_url
+            }
+            failedFormatData.append(fail)
     # Article info
     article_data = {
@@ -347,8 +373,6 @@ for future in as_completed(futures):
     except Exception as vol_err:
         print("VOLUME PROCESSING ERROR:", str(vol_err))
-wait(futures)
 # Retry failed processing paper
 print("START RETRYING:", len(failedData))
 while failedData:
@@ -378,15 +402,15 @@ if len(authorData) > 0:
 # Save error record
 if len(totallyFailedData) > 0:
-    ejde_save.save_data(failedData, "", "Failed_article_record.json")
+    ejde_save.save_data(totallyFailedData, "Error", "Failed_article_record.json")
     print("Total failed processing paper:", len(totallyFailedData))
 if len(failedVolData) > 0:
-    ejde_save.save_data(failedVolData, "", "Failed_volume_record.json")
+    ejde_save.save_data(failedVolData, "Error", "Failed_volume_record.json")
     print("Total failed fetching volume:", len(failedVolData))
 if len(failedFormatData) > 0:
-    ejde_save.save_data(failedFormatData, "", "Failed_format_record.json")
+    ejde_save.save_data(failedFormatData, "Error", "Failed_format_record.json")
     print("Total failed searching article:", len(failedFormatData))
 # Total running time
@@ -394,4 +418,4 @@ print("time elapsed: {:.2f}s".format(time.time() - start_time))
 # Transfer to large file and delete the temporary storage files
 ejde_save.Transf()
 ejde_save.delete()
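For quick reference, a standalone sketch of the two-step volume extraction added to process_article above; the regexes are copied from the diff, while the sample header strings are assumptions rather than scraped output:

# Sketch of the volume fallback: try "Vol. N (YYYY)" first, then plain "Vol. N".
import re

def extract_volume(article_text):
    volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
    volume = str(volume_match.group(1)) if volume_match else None
    if not volume:
        # Fallback for pages that omit the parenthesised year
        volume_match = re.search(r'Vol\. (\d+)', article_text)
        volume = str(volume_match.group(1)) if volume_match else None
    return volume

print(extract_volume("Electron. J. Differential Equations, Vol. 2023 (2023), No. 45"))  # 2023
print(extract_volume("Electron. J. Differential Equations, Vol. 1999, No. 10"))         # 1999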