Optimization:
strip "\newline" from author names
This commit is contained in:
parent
ed469ee362
commit
083e6c87eb
@@ -223,7 +223,7 @@ def process_article(title, article_url):
|
||||
cells = [cell.text.strip() for cell in row.find_all('td')]
|
||||
for cell in cells:
|
||||
cell = re.split(r'[\r\n]+', cell)
|
||||
cell = [element.replace('email: ', '') for element in cell]
|
||||
cell = [c.replace('email: ', '').replace('\\newline', '') for c in cell]
|
||||
cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]
|
||||
|
||||
# Data processing
|
||||
@@ -240,8 +240,8 @@ def process_article(title, article_url):
|
||||
"from_article": article_id,
|
||||
"firstname": name[0],
|
||||
"lastname": name[-1],
|
||||
"middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(
|
||||
name) > 2 else None,
|
||||
"middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0
|
||||
] if len(name) > 2 else None,
|
||||
"affiliation": [{
|
||||
"year": volume,
|
||||
"affiliation": affiliation,
|
||||
@@ -269,12 +269,12 @@ def process_article(title, article_url):
|
||||
matches = matches.split("<p>")
|
||||
for match in matches:
|
||||
match = re.sub(r'<[^>]+>', '', match)
|
||||
match = match.lstrip("\\n ").rstrip("\\n ").strip()
|
||||
match = match.lstrip("\\n ").lstrip("\n ").rstrip("\\n ").rstrip("\n ").strip()
|
||||
if match_type == 0:
|
||||
match = match.split("\\n")
|
||||
else:
|
||||
match = match.split("\n")
|
||||
match = [element.replace('email: ', '') for element in match]
|
||||
match = [m.replace('email: ', '').replace('\\newline', '') for m in match]
|
||||
match = [re.sub(r'\s+', ' ', m).strip() for m in match]
|
||||
|
||||
# Data processing
|
||||
@@ -291,8 +291,8 @@ def process_article(title, article_url):
|
||||
"from_article": article_id,
|
||||
"firstname": name[0],
|
||||
"lastname": name[-1],
|
||||
"middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(
|
||||
name) > 2 else None,
|
||||
"middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0
|
||||
] if len(name) > 2 else None,
|
||||
"affiliation": [{
|
||||
"year": volume,
|
||||
"affiliation": affiliation,
|
||||
@@ -420,4 +420,4 @@ print("time elapsed: {:.2f}s".format(time.time() - start_time))
|
||||
|
||||
# Transfer to large file and delete the temporary storage files
|
||||
ejde_save.Transf()
|
||||
# ejde_save.delete()
|
||||
ejde_save.delete()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user