Bug Fix:
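1. Author data is now split on any run of line breaks ("\r" or "\n"), not only on "\n". 2. Author names are now split on "." as well as on spaces. 3. Added a method for extracting author info when the page contains three or more <hr> tags.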
This commit is contained in:
parent
f97195c94d
commit
68a755a633
@ -222,14 +222,14 @@ def process_article(title, article_url):
|
|||||||
for row in table.find_all('tr'):
|
for row in table.find_all('tr'):
|
||||||
cells = [cell.text.strip() for cell in row.find_all('td')]
|
cells = [cell.text.strip() for cell in row.find_all('td')]
|
||||||
for cell in cells:
|
for cell in cells:
|
||||||
cell = cell.split("\n")
|
cell = re.split(r'[\r\n]+', cell)
|
||||||
cell = [element.replace('email: ', '') for element in cell]
|
cell = [element.replace('email: ', '') for element in cell]
|
||||||
cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]
|
cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]
|
||||||
|
|
||||||
# Data processing
|
# Data processing
|
||||||
if cell[0]:
|
if cell[0]:
|
||||||
authors.append(cell[0])
|
authors.append(cell[0])
|
||||||
name = cell[0].split(" ")
|
name = re.split(r'[ .]', cell[0])
|
||||||
affiliation = ', '.join(cell[1:-1])
|
affiliation = ', '.join(cell[1:-1])
|
||||||
affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
|
affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
|
||||||
email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1])
|
email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1])
|
||||||
@ -252,7 +252,11 @@ def process_article(title, article_url):
|
|||||||
# If no author table
|
# If no author table
|
||||||
else:
|
else:
|
||||||
match_type = 0
|
match_type = 0
|
||||||
pattern = r'<hr>(.*?)<hr>'
|
hr_count = len(soup.find_all('hr'))
|
||||||
|
if hr_count < 3:
|
||||||
|
pattern = r'<hr>(.*?)<hr>'
|
||||||
|
else:
|
||||||
|
pattern = r'<hr>(?:.*<hr>)(.*)(?=<hr>)'
|
||||||
matches = str(re.findall(pattern, html, re.DOTALL))
|
matches = str(re.findall(pattern, html, re.DOTALL))
|
||||||
if len(matches) < 5:
|
if len(matches) < 5:
|
||||||
match_type = 1
|
match_type = 1
|
||||||
@ -276,7 +280,7 @@ def process_article(title, article_url):
|
|||||||
# Data processing
|
# Data processing
|
||||||
if match[0]:
|
if match[0]:
|
||||||
authors.append(match[0])
|
authors.append(match[0])
|
||||||
name = match[0].split(" ")
|
name = re.split(r'[ .]', match[0])
|
||||||
affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip()
|
affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip()
|
||||||
affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
|
affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
|
||||||
email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1])
|
email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1])
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user