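Bug Fix:
1. Reformat the regular expressions used for keyword matching: the primary pattern now stops at `<br>`, `<p>`, or end of text; the fallback pattern stops at the next `<`; and any HTML tags are stripped from the captured keywords.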
This commit is contained in:
parent
b1eba69085
commit
ed469ee362
@ -196,11 +196,11 @@ def process_article(title, article_url):
|
|||||||
msc = None
|
msc = None
|
||||||
|
|
||||||
# Extract KeyWords
|
# Extract KeyWords
|
||||||
keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
|
keywords_match = re.search(r'Key Words: (.*?)(?:<br>|<p>|$)', html, re.DOTALL)
|
||||||
if not keywords_match:
|
if not keywords_match:
|
||||||
keywords_match = re.search(r'Key Words: (.*?)<br>', html, re.DOTALL)
|
keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
|
||||||
if keywords_match:
|
if keywords_match:
|
||||||
keywords = keywords_match.group(1).strip().replace('\n', '')
|
keywords = re.sub(r'<[^>]+>', '', keywords_match.group(1).strip()).replace('\n', '')
|
||||||
keywords = re.split(r', |;', keywords)
|
keywords = re.split(r', |;', keywords)
|
||||||
keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords]
|
keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords]
|
||||||
else:
|
else:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user