From ed469ee362cef9708f37a54526815a2075276875 Mon Sep 17 00:00:00 2001 From: ldy <1913292237@qq.com> Date: Fri, 11 Aug 2023 19:52:40 +0800 Subject: [PATCH] Bug Fix: 1. reformat regular expressions for keyword matching --- 01_EJDE_spider/ejde_main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py index d13393f..bd8429c 100644 --- a/01_EJDE_spider/ejde_main.py +++ b/01_EJDE_spider/ejde_main.py @@ -196,11 +196,11 @@ def process_article(title, article_url): msc = None # Extract KeyWords - keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL) + keywords_match = re.search(r'Key Words: (.*?)(?:
|

|$)', html, re.DOTALL) if not keywords_match: - keywords_match = re.search(r'Key Words: (.*?)
', html, re.DOTALL) + keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL) if keywords_match: - keywords = keywords_match.group(1).strip().replace('\n', '') + keywords = re.sub(r'<[^>]+>', '', keywords_match.group(1).strip()).replace('\n', '') keywords = re.split(r', |;', keywords) keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords] else: