From ed469ee362cef9708f37a54526815a2075276875 Mon Sep 17 00:00:00 2001
From: ldy <1913292237@qq.com>
Date: Fri, 11 Aug 2023 19:52:40 +0800
Subject: [PATCH] Bug Fix: 1. reformat regular expressions for keyword matching
---
01_EJDE_spider/ejde_main.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py
index d13393f..bd8429c 100644
--- a/01_EJDE_spider/ejde_main.py
+++ b/01_EJDE_spider/ejde_main.py
@@ -196,11 +196,11 @@ def process_article(title, article_url):
msc = None
# Extract KeyWords
- keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
+ keywords_match = re.search(r'Key Words: (.*?)(?:
|
|$)', html, re.DOTALL)
if not keywords_match:
- keywords_match = re.search(r'Key Words: (.*?)
', html, re.DOTALL)
+ keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
if keywords_match:
- keywords = keywords_match.group(1).strip().replace('\n', '')
+ keywords = re.sub(r'<[^>]+>', '', keywords_match.group(1).strip()).replace('\n', '')
keywords = re.split(r', |;', keywords)
keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords]
else: