From 083e6c87eb4da86d996adc791f3b618ddebe076e Mon Sep 17 00:00:00 2001 From: ldy <1913292237@qq.com> Date: Fri, 11 Aug 2023 20:45:04 +0800 Subject: [PATCH] Optimization: strip "\newline" in author name --- 01_EJDE_spider/ejde_main.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py index bd8429c..17577cf 100644 --- a/01_EJDE_spider/ejde_main.py +++ b/01_EJDE_spider/ejde_main.py @@ -223,7 +223,7 @@ def process_article(title, article_url): cells = [cell.text.strip() for cell in row.find_all('td')] for cell in cells: cell = re.split(r'[\r\n]+', cell) - cell = [element.replace('email: ', '') for element in cell] + cell = [c.replace('email: ', '').replace('\\newline', '') for c in cell] cell = [re.sub(r'\s+', ' ', c).strip() for c in cell] # Data processing @@ -240,8 +240,8 @@ def process_article(title, article_url): "from_article": article_id, "firstname": name[0], "lastname": name[-1], - "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len( - name) > 2 else None, + "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0 + ] if len(name) > 2 else None, "affiliation": [{ "year": volume, "affiliation": affiliation, @@ -269,12 +269,12 @@ def process_article(title, article_url): matches = matches.split("
") for match in matches: match = re.sub(r'<[^>]+>', '', match) - match = match.lstrip("\\n ").rstrip("\\n ").strip() + match = match.lstrip("\\n ").lstrip("\n ").rstrip("\\n ").rstrip("\n ").strip() if match_type == 0: match = match.split("\\n") else: match = match.split("\n") - match = [element.replace('email: ', '') for element in match] + match = [m.replace('email: ', '').replace('\\newline', '') for m in match] match = [re.sub(r'\s+', ' ', m).strip() for m in match] # Data processing @@ -291,8 +291,8 @@ def process_article(title, article_url): "from_article": article_id, "firstname": name[0], "lastname": name[-1], - "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len( - name) > 2 else None, + "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0 + ] if len(name) > 2 else None, "affiliation": [{ "year": volume, "affiliation": affiliation, @@ -420,4 +420,4 @@ print("time elapsed: {:.2f}s".format(time.time() - start_time)) # Transfer to large file and delete the temporary storage files ejde_save.Transf() -# ejde_save.delete() +ejde_save.delete()