diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py index 86b8b68..3c9c71a 100644 --- a/01_EJDE_spider/ejde_main.py +++ b/01_EJDE_spider/ejde_main.py @@ -13,7 +13,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed 爬取网站:'ejde.math.txstate.edu' Total number of papers: 2023/08/08 - 4300 - Total Time via VPN w/119ms-delay: 441.80s + Total Time via VPN w/100ms-delay: 254.04s ==========运行顺序========== 1、ejde_main 获取各年份的期刊链接 -> 抓取各篇论文的信息和作者信息 -> 调用ejde_save -> 存入小文件(json)暂存 @@ -156,7 +156,8 @@ def process_article(title, article_url): volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text) volume = str(volume_match.group(1)) if volume_match else None if not volume: - volume = str(volume_match.group(2)) if volume_match else None + volume_match = re.search(r'Vol\. (\d+)', article_text) + volume = str(volume_match.group(1)) if volume_match else None # Extract pp pp_match = re.search(r'pp\. (\d+-\d+)', article_text) @@ -183,7 +184,8 @@ def process_article(title, article_url): if not msc_match: msc_match = re.search(r'Math Subject Classifications:(.*?)\.', html) if msc_match: - msc = msc_match.group(1).strip().strip('.').strip() + msc = re.sub(r'<[^>]+>', '', msc_match.group(1).strip()) + msc = msc.strip('.').strip() msc = re.split(r', |;', msc) else: msc = None @@ -217,68 +219,85 @@ def process_article(title, article_url): for cell in cells: cell = cell.split("\n") cell = [element.replace('email: ', '') for element in cell] - cell = [c.strip() for c in cell] + cell = [re.sub(r'\s+', ' ', c).strip() for c in cell] # Data processing - authors.append(cell[0]) - name = cell[0].split(" ") - affiliation = ', '.join(cell[1:-1]) - affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip() - email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1]) - email = email_match.group() if email_match else None + if cell[0]: + authors.append(cell[0]) + name = cell[0].split(" ") + affiliation = ', '.join(cell[1:-1]) + affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip() + email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1]) + email = email_match.group() if email_match else None - author_data = { - "author_id": str(uuid.uuid4()), - "from_article": article_id, - "firstname": name[0], - "lastname": name[-1], - "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(name) > 2 else None, - "affiliation": [{ - "year": volume, - "affiliation": affiliation, - "email": email - }] - } - authorData.append(author_data) + author_data = { + "author_id": str(uuid.uuid4()), + "from_article": article_id, + "firstname": name[0], + "lastname": name[-1], + "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len( + name) > 2 else None, + "affiliation": [{ + "year": volume, + "affiliation": affiliation, + "email": email + }] + } + authorData.append(author_data) # If no author table else: + match_type = 0 pattern = r'
(.*?)
' matches = str(re.findall(pattern, html, re.DOTALL)) + if len(matches) < 5: + match_type = 1 + last_p_tag = str(article_soup.find_all('p')[-1]) + pattern = r'

(.*?)


' + matches = re.search(pattern, str(last_p_tag), re.DOTALL).group(1).strip() + if matches: matches = matches.replace('["', '').replace('"]', '').replace('[\'', '').replace('\']', '') matches = matches.split("

") - for match in matches: match = re.sub(r'<[^>]+>', '', match) match = match.lstrip("\\n ").rstrip("\\n ").strip() - match = match.split("\\n") + if match_type == 0: + match = match.split("\\n") + else: + match = match.split("\n") match = [element.replace('email: ', '') for element in match] - match = [m.strip() for m in match] + match = [re.sub(r'\s+', ' ', m).strip() for m in match] # Data processing - authors.append(match[0]) - name = match[0].split(" ") - affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip() - affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip() - email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1]) - email = email_match.group() if email_match else None + if match[0]: + authors.append(match[0]) + name = match[0].split(" ") + affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip() + affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip() + email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1]) + email = email_match.group() if email_match else None - author_data = { - "author_id": str(uuid.uuid4()), - "from_article": article_id, - "firstname": name[0], - "lastname": name[-1], - "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(name) > 2 else None, - "affiliation": [{ - "year": volume, - "affiliation": affiliation, - "email": email - }] - } - authorData.append(author_data) + author_data = { + "author_id": str(uuid.uuid4()), + "from_article": article_id, + "firstname": name[0], + "lastname": name[-1], + "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len( + name) > 2 else None, + "affiliation": [{ + "year": volume, + "affiliation": affiliation, + "email": email + }] + } + authorData.append(author_data) else: print("AUTHOR SEARCHING ERROR:", article_url) - return + fail = { + "title": title, + "URL": article_url + } + failedFormatData.append(fail) # Article info article_data = { @@ -376,7 +395,7 @@ if len(authorData) > 0: # Save error record if len(totallyFailedData) > 0: - ejde_save.save_data(failedData, "Error", "Failed_article_record.json") + ejde_save.save_data(totallyFailedData, "Error", "Failed_article_record.json") print("Total failed processing paper:", len(totallyFailedData)) if len(failedVolData) > 0: