Change the data structure
commit 1602d03e9d
@@ -7,13 +7,13 @@ import ejde_save
 from retrying import retry
 from datetime import datetime
 from bs4 import BeautifulSoup
-from concurrent.futures import ThreadPoolExecutor, as_completed, wait
+from concurrent.futures import ThreadPoolExecutor, as_completed
 
 '''
 Scraped site: 'ejde.math.txstate.edu'
 
 Total number of papers: 2023/08/08 - 4300
-Total Time via VPN w/119ms-delay: 441.80s
+Total Time via VPN w/100ms-delay: 254.04s
 
 ==========Run order==========
 1. ejde_main fetches the journal links for each year -> scrapes each paper's info and author info -> calls ejde_save -> stores the results temporarily in small JSON files
@@ -152,9 +152,17 @@ def process_article(title, article_url):
 article_soup = BeautifulSoup(html, 'html.parser')
 article_text = article_soup.get_text()
 
+# Extract title if title == None
+if not title:
+    title_match = re.search(r"<h3>(.*?)<p>", article_text)
+    title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else None
+
 # Extract volume
 volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
 volume = str(volume_match.group(1)) if volume_match else None
+if not volume:
+    volume_match = re.search(r'Vol\. (\d+)', article_text)
+    volume = str(volume_match.group(1)) if volume_match else None
 
 # Extract pp
 pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
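The added fallback tries the full "Vol. NNNN (YYYY)" form first and only then the bare "Vol. NNNN" form. A minimal sketch of that two-step lookup, using made-up header strings:

    import re

    def extract_volume(article_text):
        # Preferred form: "Vol. 2023 (2023)" -> capture the volume number.
        match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
        if not match:
            # Fallback form without the parenthesised year: "Vol. 2019".
            match = re.search(r'Vol\. (\d+)', article_text)
        return str(match.group(1)) if match else None

    print(extract_volume("Electron. J. Differential Equations, Vol. 2023 (2023), No. 45, pp. 1-18."))  # 2023
    print(extract_volume("Vol. 2019, No. 100, pp. 1-14."))                                             # 2019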
@@ -181,17 +189,18 @@ def process_article(title, article_url):
 if not msc_match:
 msc_match = re.search(r'Math Subject Classifications:(.*?)\.', html)
 if msc_match:
-msc = msc_match.group(1).strip().strip('.').strip()
+msc = re.sub(r'<[^>]+>', '', msc_match.group(1).strip())
+msc = msc.strip('.').strip()
 msc = re.split(r', |;', msc)
 else:
 msc = []
 
 # Extract KeyWords
-keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
+keywords_match = re.search(r'Key Words: (.*?)(?:<br>|<p>|$)', html, re.DOTALL)
 if not keywords_match:
-keywords_match = re.search(r'Key Words: (.*?)<br>', html, re.DOTALL)
+keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
 if keywords_match:
-keywords = keywords_match.group(1).strip().replace('\n', '')
+keywords = re.sub(r'<[^>]+>', '', keywords_match.group(1).strip()).replace('\n', '')
 keywords = re.split(r', |;', keywords)
 keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords]
 else:
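The reworked keyword pattern stops at the first <br> or <p> (or end of input) rather than at the first '<' of any tag, so inline markup such as <i>...</i> stays inside the capture and is stripped afterwards. A small illustration on a fabricated snippet:

    import re

    html = "Key Words: Nonlinear elliptic equation; <i>p</i>-Laplacian;\n variational methods.<br>"

    # New primary pattern: capture up to the first <br>, <p>, or end of string.
    m = re.search(r'Key Words: (.*?)(?:<br>|<p>|$)', html, re.DOTALL)
    keywords = re.sub(r'<[^>]+>', '', m.group(1).strip()).replace('\n', '')
    keywords = re.split(r', |;', keywords)
    keywords = [re.sub(r'\s+', ' ', k.strip().strip('.')).strip() for k in keywords]
    print(keywords)  # ['Nonlinear elliptic equation', 'p-Laplacian', 'variational methods']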
@@ -213,14 +222,14 @@ def process_article(title, article_url):
 for row in table.find_all('tr'):
 cells = [cell.text.strip() for cell in row.find_all('td')]
 for cell in cells:
-cell = cell.split("\n")
-cell = [element.replace('email: ', '') for element in cell]
-cell = [c.strip() for c in cell]
+cell = re.split(r'[\r\n]+', cell)
+cell = [c.replace('email: ', '').replace('\\newline', '') for c in cell]
+cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]
 
 # Data processing
+if cell[0]:
 authors.append(cell[0])
-name = cell[0].split(" ")
-middle_name = ''.join(name[1:-1]) if name[1:-1] else None
+name = re.split(r'[ .]', cell[0])
 affiliation = ', '.join(cell[1:-1])
 affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
 email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1])
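Splitting cells on [\r\n]+ tolerates Windows line endings, the extra replace drops stray \newline LaTeX artefacts, and splitting the name on '[ .]' changes how initials are handled. A short walk-through with an invented table cell:

    import re

    cell = "John A. Smith\r\nDepartment of Mathematics\\newline Example University\r\nemail: jsmith@example.edu"

    parts = re.split(r'[\r\n]+', cell)                                   # split lines, collapsing \r\n
    parts = [c.replace('email: ', '').replace('\\newline', '') for c in parts]
    parts = [re.sub(r'\s+', ' ', c).strip() for c in parts]
    print(parts)  # ['John A. Smith', 'Department of Mathematics Example University', 'jsmith@example.edu']

    name = re.split(r'[ .]', parts[0])
    print(name)   # ['John', 'A', '', 'Smith'] -- the period in the initial yields an empty element
    # name[0] -> 'John', ''.join(name[1:-1]) -> 'A', name[-1] -> 'Smith'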
@@ -228,10 +237,10 @@ def process_article(title, article_url):
 
 author_data = {
 "author_id": str(uuid.uuid4()),
-"from_article": [article_id],
+"from_article": article_id,
 "first_name": name[0],
 "last_name": name[-1],
-"middle_name": middle_name,
+"middle_name": ''.join(name[1:-1]) if name[1:-1] else None,
 "affiliation": [{
 "year": volume,
 "affiliation": affiliation,
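This is the data-structure change the commit message refers to: from_article now holds the bare article id instead of a one-element list, and middle_name is computed inline from the split name. A hedged sketch of one author record before and after; the field values are placeholders and only the keys visible in this hunk are shown:

    import uuid

    article_id = str(uuid.uuid4())

    # Before: the article reference was wrapped in a list.
    author_before = {
        "author_id": str(uuid.uuid4()),
        "from_article": [article_id],       # one-element list
        "first_name": "John",
        "last_name": "Smith",
        "middle_name": "A",                 # precomputed middle_name variable
        "affiliation": [{"year": "2023", "affiliation": "Example University"}],
    }

    # After: the article id is stored directly as a string.
    author_after = {
        "author_id": str(uuid.uuid4()),
        "from_article": article_id,         # plain string
        "first_name": "John",
        "last_name": "Smith",
        "middle_name": "A",                 # ''.join(name[1:-1]) if name[1:-1] else None
        "affiliation": [{"year": "2023", "affiliation": "Example University"}],
    }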
@@ -241,23 +250,36 @@ def process_article(title, article_url):
 authorData.append(author_data)
 # If no author table
 else:
+match_type = 0
+hr_count = len(article_soup.find_all('hr'))
+if hr_count < 3:
 pattern = r'<hr>(.*?)<hr>'
+else:
+    pattern = r'<hr>(?:.*<hr>)(.*)(?=<hr>)'
 matches = str(re.findall(pattern, html, re.DOTALL))
+if len(matches) < 5:
+    match_type = 1
+    last_p_tag = str(article_soup.find_all('p')[-1])
+    pattern = r'<p>(.*?)<hr/>'
+    matches = re.search(pattern, str(last_p_tag), re.DOTALL).group(1).strip()
 
 if matches:
 matches = matches.replace('["', '').replace('"]', '').replace('[\'', '').replace('\']', '')
 matches = matches.split("<p>")
 
 for match in matches:
 match = re.sub(r'<[^>]+>', '', match)
-match = match.lstrip("\\n ").rstrip("\\n ").strip()
+match = match.lstrip("\\n ").lstrip("\n ").rstrip("\\n ").rstrip("\n ").strip()
+if match_type == 0:
 match = match.split("\\n")
-match = [element.replace('email: ', '') for element in match]
-match = [m.strip() for m in match]
+else:
+    match = match.split("\n")
+match = [m.replace('email: ', '').replace('\\newline', '') for m in match]
+match = [re.sub(r'\s+', ' ', m).strip() for m in match]
 
 # Data processing
+if match[0]:
 authors.append(match[0])
-name = match[0].split(" ")
-middle_name = ''.join(name[1:-1]) if name[1:-1] else None
+name = re.split(r'[ .]', match[0])
 affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip()
 affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
 email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1])
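The new author-block fallback counts <hr> separators: with fewer than three it keeps the old between-two-<hr> pattern, otherwise it targets the text between the last two <hr> tags, and if str(re.findall(...)) is shorter than five characters (an empty list prints as "[]") it falls back to the last <p> tag. A rough illustration on made-up page fragments:

    import re

    # Page with only two <hr> separators: the old pattern still applies.
    short_page = "<hr>J. Smith<br>Dept. of Math<br>email: js@example.edu<hr>"
    print(re.findall(r'<hr>(.*?)<hr>', short_page, re.DOTALL))
    # ['J. Smith<br>Dept. of Math<br>email: js@example.edu']

    # Page with four <hr> separators: capture the block between the last two.
    long_page = "<hr>abstract<hr>references<hr>J. Smith<br>Dept. of Math<hr>"
    print(re.findall(r'<hr>(?:.*<hr>)(.*)(?=<hr>)', long_page, re.DOTALL))
    # ['J. Smith<br>Dept. of Math']

    # An empty findall stringifies to "[]" (length 2 < 5), which triggers the <p>-tag fallback.
    print(len(str(re.findall(r'<hr>(?:.*<hr>)(.*)(?=<hr>)', "<hr>only one<hr>", re.DOTALL))))  # 2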
@@ -265,10 +287,10 @@ def process_article(title, article_url):
 
 author_data = {
 "author_id": str(uuid.uuid4()),
-"from_article": [article_id],
+"from_article": article_id,
 "first_name": name[0],
 "last_name": name[-1],
-"middle_name": middle_name,
+"middle_name": ''.join(name[1:-1]) if name[1:-1] else None,
 "affiliation": [{
 "year": volume,
 "affiliation": affiliation,
@@ -278,7 +300,11 @@ def process_article(title, article_url):
 authorData.append(author_data)
 else:
 print("AUTHOR SEARCHING ERROR:", article_url)
-return
+fail = {
+    "title": title,
+    "URL": article_url
+}
+failedFormatData.append(fail)
 
 # Article info
 article_data = {
@@ -347,8 +373,6 @@ for future in as_completed(futures):
 except Exception as vol_err:
 print("VOLUME PROCESSING ERROR:", str(vol_err))
 
-wait(futures)
-
 # Retry failed processing paper
 print("START RETRYING:", len(failedData))
 while failedData:
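Dropping wait(futures) here (together with the wait import removed in the first hunk) is redundant-code cleanup: the loop already iterates as_completed(futures), which only finishes once every submitted future has completed. A tiny self-contained sketch of that behaviour, with an invented worker function:

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def fake_volume_job(n):
        # Stand-in for the real volume-processing task.
        return n * n

    with ThreadPoolExecutor(max_workers=4) as pool:
        futures = [pool.submit(fake_volume_job, n) for n in range(8)]
        for future in as_completed(futures):
            # This loop yields each future as it finishes and does not exit
            # until all of them are done, so no trailing wait(futures) is needed.
            print(future.result())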
@@ -378,15 +402,15 @@ if len(authorData) > 0:
 
 # Save error record
 if len(totallyFailedData) > 0:
-ejde_save.save_data(failedData, "", "Failed_article_record.json")
+ejde_save.save_data(totallyFailedData, "Error", "Failed_article_record.json")
 print("Total failed processing paper:", len(totallyFailedData))
 
 if len(failedVolData) > 0:
-ejde_save.save_data(failedVolData, "", "Failed_volume_record.json")
+ejde_save.save_data(failedVolData, "Error", "Failed_volume_record.json")
 print("Total failed fetching volume:", len(failedVolData))
 
 if len(failedFormatData) > 0:
-ejde_save.save_data(failedFormatData, "", "Failed_format_record.json")
+ejde_save.save_data(failedFormatData, "Error", "Failed_format_record.json")
 print("Total failed searching article:", len(failedFormatData))
 
 # Total running time
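The error records now go through the same ejde_save.save_data(data, subfolder, filename) call as the regular output, just into an "Error" subfolder instead of an empty path. ejde_save itself is not shown in this commit; a minimal sketch of a compatible helper, assuming that signature and plain JSON output, might look like:

    import json
    import os

    def save_data(data, subfolder, filename):
        # Assumed behaviour: write `data` as JSON under an optional subfolder.
        directory = os.path.join(".", subfolder) if subfolder else "."
        os.makedirs(directory, exist_ok=True)
        with open(os.path.join(directory, filename), "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

    # e.g. save_data(failedVolData, "Error", "Failed_volume_record.json")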