Optimization:

Strip the literal "\newline" marker from author names.
This commit is contained in:
ldy 2023-08-11 20:45:04 +08:00
parent ed469ee362
commit 083e6c87eb

View File

@ -223,7 +223,7 @@ def process_article(title, article_url):
cells = [cell.text.strip() for cell in row.find_all('td')] cells = [cell.text.strip() for cell in row.find_all('td')]
for cell in cells: for cell in cells:
cell = re.split(r'[\r\n]+', cell) cell = re.split(r'[\r\n]+', cell)
cell = [element.replace('email: ', '') for element in cell] cell = [c.replace('email: ', '').replace('\\newline', '') for c in cell]
cell = [re.sub(r'\s+', ' ', c).strip() for c in cell] cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]
# Data processing # Data processing
@ -240,8 +240,8 @@ def process_article(title, article_url):
"from_article": article_id, "from_article": article_id,
"firstname": name[0], "firstname": name[0],
"lastname": name[-1], "lastname": name[-1],
"middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len( "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0
name) > 2 else None, ] if len(name) > 2 else None,
"affiliation": [{ "affiliation": [{
"year": volume, "year": volume,
"affiliation": affiliation, "affiliation": affiliation,
@ -269,12 +269,12 @@ def process_article(title, article_url):
matches = matches.split("<p>") matches = matches.split("<p>")
for match in matches: for match in matches:
match = re.sub(r'<[^>]+>', '', match) match = re.sub(r'<[^>]+>', '', match)
match = match.lstrip("\\n ").rstrip("\\n ").strip() match = match.lstrip("\\n ").lstrip("\n ").rstrip("\\n ").rstrip("\n ").strip()
if match_type == 0: if match_type == 0:
match = match.split("\\n") match = match.split("\\n")
else: else:
match = match.split("\n") match = match.split("\n")
match = [element.replace('email: ', '') for element in match] match = [m.replace('email: ', '').replace('\\newline', '') for m in match]
match = [re.sub(r'\s+', ' ', m).strip() for m in match] match = [re.sub(r'\s+', ' ', m).strip() for m in match]
# Data processing # Data processing
@ -291,8 +291,8 @@ def process_article(title, article_url):
"from_article": article_id, "from_article": article_id,
"firstname": name[0], "firstname": name[0],
"lastname": name[-1], "lastname": name[-1],
"middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len( "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0
name) > 2 else None, ] if len(name) > 2 else None,
"affiliation": [{ "affiliation": [{
"year": volume, "year": volume,
"affiliation": affiliation, "affiliation": affiliation,
@ -420,4 +420,4 @@ print("time elapsed: {:.2f}s".format(time.time() - start_time))
# Transfer to large file and delete the temporary storage files # Transfer to large file and delete the temporary storage files
ejde_save.Transf() ejde_save.Transf()
# ejde_save.delete() ejde_save.delete()