Optimization:
1. Added a new regular-expression format for volume. 2. Added a new strip method for MSC. 3. Deleted blank-space authors. 4. Optimized the middle-name strip method. 5. Added a new matching pattern for papers with no author-list table. 6. Added exception storing for AUTHOR SEARCHING ERROR. Bug fix: 1. Fixed error-record saving.
This commit is contained in:
parent
69b10a9f72
commit
3e78e9f48e
@ -13,7 +13,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
爬取网站:'ejde.math.txstate.edu'
|
||||
|
||||
Total number of papers: 2023/08/08 - 4300
|
||||
Total Time via VPN w/119ms-delay: 441.80s
|
||||
Total Time via VPN w/100ms-delay: 254.04s
|
||||
|
||||
==========运行顺序==========
|
||||
1、ejde_main 获取各年份的期刊链接 -> 抓取各篇论文的信息和作者信息 -> 调用ejde_save -> 存入小文件(json)暂存
|
||||
@ -156,7 +156,8 @@ def process_article(title, article_url):
|
||||
volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
|
||||
volume = str(volume_match.group(1)) if volume_match else None
|
||||
if not volume:
|
||||
volume = str(volume_match.group(2)) if volume_match else None
|
||||
volume_match = re.search(r'Vol\. (\d+)', article_text)
|
||||
volume = str(volume_match.group(1)) if volume_match else None
|
||||
|
||||
# Extract pp
|
||||
pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
|
||||
@ -183,7 +184,8 @@ def process_article(title, article_url):
|
||||
if not msc_match:
|
||||
msc_match = re.search(r'Math Subject Classifications:(.*?)\.', html)
|
||||
if msc_match:
|
||||
msc = msc_match.group(1).strip().strip('.').strip()
|
||||
msc = re.sub(r'<[^>]+>', '', msc_match.group(1).strip())
|
||||
msc = msc.strip('.').strip()
|
||||
msc = re.split(r', |;', msc)
|
||||
else:
|
||||
msc = None
|
||||
@ -217,9 +219,10 @@ def process_article(title, article_url):
|
||||
for cell in cells:
|
||||
cell = cell.split("\n")
|
||||
cell = [element.replace('email: ', '') for element in cell]
|
||||
cell = [c.strip() for c in cell]
|
||||
cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]
|
||||
|
||||
# Data processing
|
||||
if cell[0]:
|
||||
authors.append(cell[0])
|
||||
name = cell[0].split(" ")
|
||||
affiliation = ', '.join(cell[1:-1])
|
||||
@ -232,7 +235,8 @@ def process_article(title, article_url):
|
||||
"from_article": article_id,
|
||||
"firstname": name[0],
|
||||
"lastname": name[-1],
|
||||
"middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(name) > 2 else None,
|
||||
"middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(
|
||||
name) > 2 else None,
|
||||
"affiliation": [{
|
||||
"year": volume,
|
||||
"affiliation": affiliation,
|
||||
@ -242,20 +246,30 @@ def process_article(title, article_url):
|
||||
authorData.append(author_data)
|
||||
# If no author table
|
||||
else:
|
||||
match_type = 0
|
||||
pattern = r'<hr>(.*?)<hr>'
|
||||
matches = str(re.findall(pattern, html, re.DOTALL))
|
||||
if len(matches) < 5:
|
||||
match_type = 1
|
||||
last_p_tag = str(article_soup.find_all('p')[-1])
|
||||
pattern = r'<p>(.*?)<hr/>'
|
||||
matches = re.search(pattern, str(last_p_tag), re.DOTALL).group(1).strip()
|
||||
|
||||
if matches:
|
||||
matches = matches.replace('["', '').replace('"]', '').replace('[\'', '').replace('\']', '')
|
||||
matches = matches.split("<p>")
|
||||
|
||||
for match in matches:
|
||||
match = re.sub(r'<[^>]+>', '', match)
|
||||
match = match.lstrip("\\n ").rstrip("\\n ").strip()
|
||||
if match_type == 0:
|
||||
match = match.split("\\n")
|
||||
else:
|
||||
match = match.split("\n")
|
||||
match = [element.replace('email: ', '') for element in match]
|
||||
match = [m.strip() for m in match]
|
||||
match = [re.sub(r'\s+', ' ', m).strip() for m in match]
|
||||
|
||||
# Data processing
|
||||
if match[0]:
|
||||
authors.append(match[0])
|
||||
name = match[0].split(" ")
|
||||
affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip()
|
||||
@ -268,7 +282,8 @@ def process_article(title, article_url):
|
||||
"from_article": article_id,
|
||||
"firstname": name[0],
|
||||
"lastname": name[-1],
|
||||
"middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(name) > 2 else None,
|
||||
"middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(
|
||||
name) > 2 else None,
|
||||
"affiliation": [{
|
||||
"year": volume,
|
||||
"affiliation": affiliation,
|
||||
@ -278,7 +293,11 @@ def process_article(title, article_url):
|
||||
authorData.append(author_data)
|
||||
else:
|
||||
print("AUTHOR SEARCHING ERROR:", article_url)
|
||||
return
|
||||
fail = {
|
||||
"title": title,
|
||||
"URL": article_url
|
||||
}
|
||||
failedFormatData.append(fail)
|
||||
|
||||
# Article info
|
||||
article_data = {
|
||||
@ -376,7 +395,7 @@ if len(authorData) > 0:
|
||||
|
||||
# Save error record
|
||||
if len(totallyFailedData) > 0:
|
||||
ejde_save.save_data(failedData, "Error", "Failed_article_record.json")
|
||||
ejde_save.save_data(totallyFailedData, "Error", "Failed_article_record.json")
|
||||
print("Total failed processing paper:", len(totallyFailedData))
|
||||
|
||||
if len(failedVolData) > 0:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user