1. Added splitting of author data on line breaks ("\r" or "\n")
2. Added splitting of author names on "." in addition to spaces
3. Added a pattern for extracting author info when the page has three or more <hr> tags
This commit is contained in:
ldy 2023-08-11 19:13:33 +08:00
parent f97195c94d
commit 68a755a633

View File

@ -222,14 +222,14 @@ def process_article(title, article_url):
for row in table.find_all('tr'):
cells = [cell.text.strip() for cell in row.find_all('td')]
for cell in cells:
cell = cell.split("\n")
cell = re.split(r'[\r\n]+', cell)
cell = [element.replace('email: ', '') for element in cell]
cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]
# Data processing
if cell[0]:
authors.append(cell[0])
name = cell[0].split(" ")
name = re.split(r'[ .]', cell[0])
affiliation = ', '.join(cell[1:-1])
affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1])
@ -252,7 +252,11 @@ def process_article(title, article_url):
# If no author table
else:
match_type = 0
pattern = r'<hr>(.*?)<hr>'
hr_count = len(soup.find_all('hr'))
if hr_count < 3:
pattern = r'<hr>(.*?)<hr>'
else:
pattern = r'<hr>(?:.*<hr>)(.*)(?=<hr>)'
matches = str(re.findall(pattern, html, re.DOTALL))
if len(matches) < 5:
match_type = 1
@ -276,7 +280,7 @@ def process_article(title, article_url):
# Data processing
if match[0]:
authors.append(match[0])
name = match[0].split(" ")
name = re.split(r'[ .]', match[0])
affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip()
affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1])