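Bug Fix:
1. Rework the regular expressions for keyword matching: first capture the text after "Key Words: " up to `<br>`, `<p>`, or the end of the text, fall back to stopping at the next tag, and strip any HTML tags from the captured keywords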
This commit is contained in:
parent
b1eba69085
commit
ed469ee362
@ -196,11 +196,11 @@ def process_article(title, article_url):
|
||||
msc = None
|
||||
|
||||
# Extract KeyWords
|
||||
keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
|
||||
keywords_match = re.search(r'Key Words: (.*?)(?:<br>|<p>|$)', html, re.DOTALL)
|
||||
if not keywords_match:
|
||||
keywords_match = re.search(r'Key Words: (.*?)<br>', html, re.DOTALL)
|
||||
keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
|
||||
if keywords_match:
|
||||
keywords = keywords_match.group(1).strip().replace('\n', '')
|
||||
keywords = re.sub(r'<[^>]+>', '', keywords_match.group(1).strip()).replace('\n', '')
|
||||
keywords = re.split(r', |;', keywords)
|
||||
keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords]
|
||||
else:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user