From 68a755a633b155aa10cb8c048cc7358b12b24304 Mon Sep 17 00:00:00 2001 From: ldy <1913292237@qq.com> Date: Fri, 11 Aug 2023 19:13:33 +0800 Subject: [PATCH] Bug Fix: 1. split author data on line breaks ("\r" or "\n") 2. split author names on "." as well as " " 3. added a method for extracting author info when there are 3 hr tags --- 01_EJDE_spider/ejde_main.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py index 53228d4..2ea14c9 100644 --- a/01_EJDE_spider/ejde_main.py +++ b/01_EJDE_spider/ejde_main.py @@ -222,14 +222,14 @@ def process_article(title, article_url): for row in table.find_all('tr'): cells = [cell.text.strip() for cell in row.find_all('td')] for cell in cells: - cell = cell.split("\n") + cell = re.split(r'[\r\n]+', cell) cell = [element.replace('email: ', '') for element in cell] cell = [re.sub(r'\s+', ' ', c).strip() for c in cell] # Data processing if cell[0]: authors.append(cell[0]) - name = cell[0].split(" ") + name = re.split(r'[ .]', cell[0]) affiliation = ', '.join(cell[1:-1]) affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip() email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1]) @@ -252,7 +252,11 @@ def process_article(title, article_url): # If no author table else: match_type = 0 - pattern = r'