diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py index d13393f..bd8429c 100644 --- a/01_EJDE_spider/ejde_main.py +++ b/01_EJDE_spider/ejde_main.py @@ -196,11 +196,11 @@ def process_article(title, article_url): msc = None # Extract KeyWords - keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL) + keywords_match = re.search(r'Key Words: (.*?)(?:
|

|$)', html, re.DOTALL) if not keywords_match: - keywords_match = re.search(r'Key Words: (.*?)
', html, re.DOTALL) + keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL) if keywords_match: - keywords = keywords_match.group(1).strip().replace('\n', '') + keywords = re.sub(r'<[^>]+>', '', keywords_match.group(1).strip()).replace('\n', '') keywords = re.split(r', |;', keywords) keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords] else: