Bug Fix:
1. Split author data wherever it contains a line break ("\n"). 2. Split author names on "." as well as on spaces. 3. Added a method for extracting author info when the page has 3 hr tags.
This commit is contained in:
parent
f97195c94d
commit
68a755a633
@ -222,14 +222,14 @@ def process_article(title, article_url):
|
||||
for row in table.find_all('tr'):
|
||||
cells = [cell.text.strip() for cell in row.find_all('td')]
|
||||
for cell in cells:
|
||||
cell = cell.split("\n")
|
||||
cell = re.split(r'[\r\n]+', cell)
|
||||
cell = [element.replace('email: ', '') for element in cell]
|
||||
cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]
|
||||
|
||||
# Data processing
|
||||
if cell[0]:
|
||||
authors.append(cell[0])
|
||||
name = cell[0].split(" ")
|
||||
name = re.split(r'[ .]', cell[0])
|
||||
affiliation = ', '.join(cell[1:-1])
|
||||
affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
|
||||
email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1])
|
||||
@ -252,7 +252,11 @@ def process_article(title, article_url):
|
||||
# If no author table
|
||||
else:
|
||||
match_type = 0
|
||||
hr_count = len(soup.find_all('hr'))
|
||||
if hr_count < 3:
|
||||
pattern = r'<hr>(.*?)<hr>'
|
||||
else:
|
||||
pattern = r'<hr>(?:.*<hr>)(.*)(?=<hr>)'
|
||||
matches = str(re.findall(pattern, html, re.DOTALL))
|
||||
if len(matches) < 5:
|
||||
match_type = 1
|
||||
@ -276,7 +280,7 @@ def process_article(title, article_url):
|
||||
# Data processing
|
||||
if match[0]:
|
||||
authors.append(match[0])
|
||||
name = match[0].split(" ")
|
||||
name = re.split(r'[ .]', match[0])
|
||||
affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip()
|
||||
affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
|
||||
email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1])
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user