diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py
index 4ae686f..f36a618 100644
--- a/01_EJDE_spider/ejde_main.py
+++ b/01_EJDE_spider/ejde_main.py
@@ -7,13 +7,13 @@ import ejde_save
 from retrying import retry
 from datetime import datetime
 from bs4 import BeautifulSoup
-from concurrent.futures import ThreadPoolExecutor, as_completed, wait
+from concurrent.futures import ThreadPoolExecutor, as_completed
 
 '''
     Target site: 'ejde.math.txstate.edu'
     Total number of papers: 2023/08/08 - 4300
-    Total Time via VPN w/119ms-delay: 441.80s
+    Total Time via VPN w/100ms-delay: 254.04s
 
     ========== Execution order ==========
     1. ejde_main: fetch each year's volume links -> scrape each paper's metadata and author info -> call ejde_save -> stash in small temporary JSON files
@@ -152,9 +152,17 @@ def process_article(title, article_url):
     article_soup = BeautifulSoup(html, 'html.parser')
     article_text = article_soup.get_text()
 
+    # Extract title if title == None
+    if not title:
+        title_match = re.search(r"<h3>(.*?)</h3>", article_text)
+        title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else None
+
     # Extract volume
     volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
     volume = str(volume_match.group(1)) if volume_match else None
+    if not volume:
+        volume_match = re.search(r'Vol\. (\d+)', article_text)
+        volume = str(volume_match.group(1)) if volume_match else None
 
     # Extract pp
     pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
@@ -181,17 +189,18 @@ def process_article(title, article_url):
     if not msc_match:
         msc_match = re.search(r'Math Subject Classifications:(.*?)\.', html)
     if msc_match:
-        msc = msc_match.group(1).strip().strip('.').strip()
+        msc = re.sub(r'<[^>]+>', '', msc_match.group(1).strip())
+        msc = msc.strip('.').strip()
         msc = re.split(r', |;', msc)
     else:
         msc = []
 
     # Extract KeyWords
-    keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
+    keywords_match = re.search(r'Key Words: (.*?)(?:<br>|<p>|$)', html, re.DOTALL)
     if not keywords_match:
-        keywords_match = re.search(r'Key Words: (.*?)<br>', html, re.DOTALL)
+        keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
     if keywords_match:
-        keywords = keywords_match.group(1).strip().replace('\n', '')
+        keywords = re.sub(r'<[^>]+>', '', keywords_match.group(1).strip()).replace('\n', '')
         keywords = re.split(r', |;', keywords)
         keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords]
     else:
@@ -213,72 +222,89 @@ def process_article(title, article_url):
         for row in table.find_all('tr'):
             cells = [cell.text.strip() for cell in row.find_all('td')]
             for cell in cells:
-                cell = cell.split("\n")
-                cell = [element.replace('email: ', '') for element in cell]
-                cell = [c.strip() for c in cell]
+                cell = re.split(r'[\r\n]+', cell)
+                cell = [c.replace('email: ', '').replace('\\newline', '') for c in cell]
+                cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]
 
                 # Data processing
-                authors.append(cell[0])
-                name = cell[0].split(" ")
-                middle_name = ''.join(name[1:-1]) if name[1:-1] else None
-                affiliation = ', '.join(cell[1:-1])
-                affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
-                email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1])
-                email = email_match.group() if email_match else None
+                if cell[0]:
+                    authors.append(cell[0])
+                    name = re.split(r'[ .]', cell[0])
+                    affiliation = ', '.join(cell[1:-1])
+                    affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
+                    email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1])
+                    email = email_match.group() if email_match else None
 
-                author_data = {
-                    "author_id": str(uuid.uuid4()),
-                    "from_article": [article_id],
-                    "first_name": name[0],
-                    "last_name": name[-1],
-                    "middle_name": middle_name,
-                    "affiliation": [{
-                        "year": volume,
-                        "affiliation": affiliation,
-                        "email": email
-                    }]
-                }
-                authorData.append(author_data)
+                    author_data = {
+                        "author_id": str(uuid.uuid4()),
+                        "from_article": article_id,
+                        "first_name": name[0],
+                        "last_name": name[-1],
+                        "middle_name": ''.join(name[1:-1]) if name[1:-1] else None,
+                        "affiliation": [{
+                            "year": volume,
+                            "affiliation": affiliation,
+                            "email": email
+                        }]
+                    }
+                    authorData.append(author_data)
 
     # If no author table
     else:
-        pattern = r'<hr><p>(.*?)</p>'
+        match_type = 0
+        hr_count = len(article_soup.find_all('hr'))
+        if hr_count < 3:
+            pattern = r'<hr>(.*?)<hr>'
+        else:
+            pattern = r'<hr>(?:.*<hr>)(.*)(?=<hr>)'
         matches = str(re.findall(pattern, html, re.DOTALL))
+        if len(matches) < 5:
+            match_type = 1
+            last_p_tag = str(article_soup.find_all('p')[-1])
+            pattern = r'<p>(.*?)</p>'
+            matches = re.search(pattern, str(last_p_tag), re.DOTALL).group(1).strip()
+
         if matches:
             matches = matches.replace('["', '').replace('"]', '').replace('[\'', '').replace('\']', '')
             matches = matches.split("<p>")
-
             for match in matches:
                 match = re.sub(r'<[^>]+>', '', match)
-                match = match.lstrip("\\n ").rstrip("\\n ").strip()
-                match = match.split("\\n")
-                match = [element.replace('email: ', '') for element in match]
-                match = [m.strip() for m in match]
+                match = match.lstrip("\\n ").lstrip("\n ").rstrip("\\n ").rstrip("\n ").strip()
+                if match_type == 0:
+                    match = match.split("\\n")
+                else:
+                    match = match.split("\n")
+                match = [m.replace('email: ', '').replace('\\newline', '') for m in match]
+                match = [re.sub(r'\s+', ' ', m).strip() for m in match]
 
                 # Data processing
-                authors.append(match[0])
-                name = match[0].split(" ")
-                middle_name = ''.join(name[1:-1]) if name[1:-1] else None
-                affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip()
-                affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
-                email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1])
-                email = email_match.group() if email_match else None
+                if match[0]:
+                    authors.append(match[0])
+                    name = re.split(r'[ .]', match[0])
+                    affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip()
+                    affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
+                    email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1])
+                    email = email_match.group() if email_match else None
 
-                author_data = {
-                    "author_id": str(uuid.uuid4()),
-                    "from_article": [article_id],
-                    "first_name": name[0],
-                    "last_name": name[-1],
-                    "middle_name": middle_name,
-                    "affiliation": [{
-                        "year": volume,
-                        "affiliation": affiliation,
-                        "email": email
-                    }]
-                }
-                authorData.append(author_data)
+                    author_data = {
+                        "author_id": str(uuid.uuid4()),
+                        "from_article": article_id,
+                        "first_name": name[0],
+                        "last_name": name[-1],
+                        "middle_name": ''.join(name[1:-1]) if name[1:-1] else None,
+                        "affiliation": [{
+                            "year": volume,
+                            "affiliation": affiliation,
+                            "email": email
+                        }]
+                    }
+                    authorData.append(author_data)
         else:
             print("AUTHOR SEARCHING ERROR:", article_url)
-            return
+            fail = {
+                "title": title,
+                "URL": article_url
+            }
+            failedFormatData.append(fail)
 
     # Article info
     article_data = {
@@ -347,8 +373,6 @@ for future in as_completed(futures):
     except Exception as vol_err:
         print("VOLUME PROCESSING ERROR:", str(vol_err))
 
-wait(futures)
-
 # Retry failed processing paper
 print("START RETRYING:", len(failedData))
 while failedData:
@@ -378,15 +402,15 @@ if len(authorData) > 0:
 
 # Save error record
 if len(totallyFailedData) > 0:
-    ejde_save.save_data(failedData, "", "Failed_article_record.json")
+    ejde_save.save_data(totallyFailedData, "Error", "Failed_article_record.json")
     print("Total failed processing paper:", len(totallyFailedData))
 
 if len(failedVolData) > 0:
-    ejde_save.save_data(failedVolData, "", "Failed_volume_record.json")
+    ejde_save.save_data(failedVolData, "Error", "Failed_volume_record.json")
     print("Total failed fetching volume:", len(failedVolData))
 
 if len(failedFormatData) > 0:
-    ejde_save.save_data(failedFormatData, "", "Failed_format_record.json")
+    ejde_save.save_data(failedFormatData, "Error", "Failed_format_record.json")
     print("Total failed searching article:", len(failedFormatData))
 
 # Total running time
@@ -394,4 +418,4 @@ print("time elapsed: {:.2f}s".format(time.time() - start_time))
 
 # Transfer to large file and delete the temporary storage files
 ejde_save.Transf()
-ejde_save.delete()
+ejde_save.delete()
\ No newline at end of file