Optimization:

Strip the literal "\newline" marker from author names
2023-08-11 20:45:04 +08:00 · 2023-08-11 20:45:04 +08:00 · 083e6c87eb
commit 083e6c87eb
parent ed469ee362
1 changed file with 8 additions and 8 deletions
--- a/01_EJDE_spider/ejde_main.py
+++ b/01_EJDE_spider/ejde_main.py
@@ -223,7 +223,7 @@ def process_article(title, article_url):
                cells = [cell.text.strip() for cell in row.find_all('td')]
                for cell in cells:
                    cell = re.split(r'[\r\n]+', cell)
-                    cell = [element.replace('email: ', '') for element in cell]
+                    cell = [c.replace('email: ', '').replace('\\newline', '') for c in cell]
                    cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]
                    # Data processing
@@ -240,8 +240,8 @@ def process_article(title, article_url):
                            "from_article": article_id,
                            "firstname": name[0],
                            "lastname": name[-1],
-                            "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(
+                            "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0
-                                name) > 2 else None,
+                                           ] if len(name) > 2 else None,
                            "affiliation": [{
                                "year": volume,
                                "affiliation": affiliation,
@@ -269,12 +269,12 @@ def process_article(title, article_url):
                matches = matches.split("<p>")
                for match in matches:
                    match = re.sub(r'<[^>]+>', '', match)
-                    match = match.lstrip("\\n ").rstrip("\\n ").strip()
+                    match = match.lstrip("\\n ").lstrip("\n ").rstrip("\\n ").rstrip("\n ").strip()
                    if match_type == 0:
                        match = match.split("\\n")
                    else:
                        match = match.split("\n")
-                    match = [element.replace('email: ', '') for element in match]
+                    match = [m.replace('email: ', '').replace('\\newline', '') for m in match]
                    match = [re.sub(r'\s+', ' ', m).strip() for m in match]
                    # Data processing
@@ -291,8 +291,8 @@ def process_article(title, article_url):
                            "from_article": article_id,
                            "firstname": name[0],
                            "lastname": name[-1],
-                            "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(
+                            "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0
-                                name) > 2 else None,
+                                           ] if len(name) > 2 else None,
                            "affiliation": [{
                                "year": volume,
                                "affiliation": affiliation,
@@ -420,4 +420,4 @@ print("time elapsed: {:.2f}s".format(time.time() - start_time))
 # Transfer to large file and delete the temporary storage files
 ejde_save.Transf()
-# ejde_save.delete()
+ejde_save.delete()