diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py index 53228d4..2ea14c9 100644 --- a/01_EJDE_spider/ejde_main.py +++ b/01_EJDE_spider/ejde_main.py @@ -222,14 +222,14 @@ def process_article(title, article_url): for row in table.find_all('tr'): cells = [cell.text.strip() for cell in row.find_all('td')] for cell in cells: - cell = cell.split("\n") + cell = re.split(r'[\r\n]+', cell) cell = [element.replace('email: ', '') for element in cell] cell = [re.sub(r'\s+', ' ', c).strip() for c in cell] # Data processing if cell[0]: authors.append(cell[0]) - name = cell[0].split(" ") + name = re.split(r'[ .]', cell[0]) affiliation = ', '.join(cell[1:-1]) affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip() email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1]) @@ -252,7 +252,11 @@ def process_article(title, article_url): # If no author table else: match_type = 0 - pattern = r'
(.*?)
' + hr_count = len(soup.find_all('hr')) + if hr_count < 3: + pattern = r'
(.*?)
' + else: + pattern = r'
(?:.*
)(.*)(?=
)' matches = str(re.findall(pattern, html, re.DOTALL)) if len(matches) < 5: match_type = 1 @@ -276,7 +280,7 @@ def process_article(title, article_url): # Data processing if match[0]: authors.append(match[0]) - name = match[0].split(" ") + name = re.split(r'[ .]', match[0]) affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip() affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip() email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1])