diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py index 3c9c71a..53228d4 100644 --- a/01_EJDE_spider/ejde_main.py +++ b/01_EJDE_spider/ejde_main.py @@ -152,6 +152,11 @@ def process_article(title, article_url): article_soup = BeautifulSoup(html, 'html.parser') article_text = article_soup.get_text() + # Extract title if title == None + if not title: + title_match = re.search(r"

(.*?)

", article_text) + title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else None + # Extract volume volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text) volume = str(volume_match.group(1)) if volume_match else None