diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py
index d13393f..bd8429c 100644
--- a/01_EJDE_spider/ejde_main.py
+++ b/01_EJDE_spider/ejde_main.py
@@ -196,11 +196,11 @@ def process_article(title, article_url):
msc = None
# Extract KeyWords
- keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
+ keywords_match = re.search(r'Key Words: (.*?)(?:
|
|$)', html, re.DOTALL)
if not keywords_match:
- keywords_match = re.search(r'Key Words: (.*?)
', html, re.DOTALL)
+ keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
if keywords_match:
- keywords = keywords_match.group(1).strip().replace('\n', '')
+ keywords = re.sub(r'<[^>]+>', '', keywords_match.group(1).strip()).replace('\n', '')
keywords = re.split(r', |;', keywords)
keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords]
else: