diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py index 53228d4..2ea14c9 100644 --- a/01_EJDE_spider/ejde_main.py +++ b/01_EJDE_spider/ejde_main.py @@ -222,14 +222,14 @@ def process_article(title, article_url): for row in table.find_all('tr'): cells = [cell.text.strip() for cell in row.find_all('td')] for cell in cells: - cell = cell.split("\n") + cell = re.split(r'[\r\n]+', cell) cell = [element.replace('email: ', '') for element in cell] cell = [re.sub(r'\s+', ' ', c).strip() for c in cell] # Data processing if cell[0]: authors.append(cell[0]) - name = cell[0].split(" ") + name = re.split(r'[ .]', cell[0]) affiliation = ', '.join(cell[1:-1]) affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip() email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1]) @@ -252,7 +252,11 @@ def process_article(title, article_url): # If no author table else: match_type = 0 - pattern = r'