1. reformat regular expressions for keyword matching
This commit is contained in:
ldy 2023-08-11 19:52:40 +08:00
parent b1eba69085
commit ed469ee362

View File

@@ -196,11 +196,11 @@ def process_article(title, article_url):
msc = None msc = None
# Extract KeyWords # Extract KeyWords
keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL) keywords_match = re.search(r'Key Words: (.*?)(?:<br>|<p>|$)', html, re.DOTALL)
if not keywords_match: if not keywords_match:
keywords_match = re.search(r'Key Words: (.*?)<br>', html, re.DOTALL) keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
if keywords_match: if keywords_match:
keywords = keywords_match.group(1).strip().replace('\n', '') keywords = re.sub(r'<[^>]+>', '', keywords_match.group(1).strip()).replace('\n', '')
keywords = re.split(r', |;', keywords) keywords = re.split(r', |;', keywords)
keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords] keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords]
else: else: