Optimization:
strip "\newline" from author names
This commit is contained in:
parent
ed469ee362
commit
083e6c87eb
@ -223,7 +223,7 @@ def process_article(title, article_url):
|
|||||||
cells = [cell.text.strip() for cell in row.find_all('td')]
|
cells = [cell.text.strip() for cell in row.find_all('td')]
|
||||||
for cell in cells:
|
for cell in cells:
|
||||||
cell = re.split(r'[\r\n]+', cell)
|
cell = re.split(r'[\r\n]+', cell)
|
||||||
cell = [element.replace('email: ', '') for element in cell]
|
cell = [c.replace('email: ', '').replace('\\newline', '') for c in cell]
|
||||||
cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]
|
cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]
|
||||||
|
|
||||||
# Data processing
|
# Data processing
|
||||||
@ -240,8 +240,8 @@ def process_article(title, article_url):
|
|||||||
"from_article": article_id,
|
"from_article": article_id,
|
||||||
"firstname": name[0],
|
"firstname": name[0],
|
||||||
"lastname": name[-1],
|
"lastname": name[-1],
|
||||||
"middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(
|
"middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0
|
||||||
name) > 2 else None,
|
] if len(name) > 2 else None,
|
||||||
"affiliation": [{
|
"affiliation": [{
|
||||||
"year": volume,
|
"year": volume,
|
||||||
"affiliation": affiliation,
|
"affiliation": affiliation,
|
||||||
@ -269,12 +269,12 @@ def process_article(title, article_url):
|
|||||||
matches = matches.split("<p>")
|
matches = matches.split("<p>")
|
||||||
for match in matches:
|
for match in matches:
|
||||||
match = re.sub(r'<[^>]+>', '', match)
|
match = re.sub(r'<[^>]+>', '', match)
|
||||||
match = match.lstrip("\\n ").rstrip("\\n ").strip()
|
match = match.lstrip("\\n ").lstrip("\n ").rstrip("\\n ").rstrip("\n ").strip()
|
||||||
if match_type == 0:
|
if match_type == 0:
|
||||||
match = match.split("\\n")
|
match = match.split("\\n")
|
||||||
else:
|
else:
|
||||||
match = match.split("\n")
|
match = match.split("\n")
|
||||||
match = [element.replace('email: ', '') for element in match]
|
match = [m.replace('email: ', '').replace('\\newline', '') for m in match]
|
||||||
match = [re.sub(r'\s+', ' ', m).strip() for m in match]
|
match = [re.sub(r'\s+', ' ', m).strip() for m in match]
|
||||||
|
|
||||||
# Data processing
|
# Data processing
|
||||||
@ -291,8 +291,8 @@ def process_article(title, article_url):
|
|||||||
"from_article": article_id,
|
"from_article": article_id,
|
||||||
"firstname": name[0],
|
"firstname": name[0],
|
||||||
"lastname": name[-1],
|
"lastname": name[-1],
|
||||||
"middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(
|
"middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0
|
||||||
name) > 2 else None,
|
] if len(name) > 2 else None,
|
||||||
"affiliation": [{
|
"affiliation": [{
|
||||||
"year": volume,
|
"year": volume,
|
||||||
"affiliation": affiliation,
|
"affiliation": affiliation,
|
||||||
@ -420,4 +420,4 @@ print("time elapsed: {:.2f}s".format(time.time() - start_time))
|
|||||||
|
|
||||||
# Transfer to large file and delete the temporary storage files
|
# Transfer to large file and delete the temporary storage files
|
||||||
ejde_save.Transf()
|
ejde_save.Transf()
|
||||||
# ejde_save.delete()
|
ejde_save.delete()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user