diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py
index ec9cfee..2563557 100644
--- a/01_EJDE_spider/ejde_main.py
+++ b/01_EJDE_spider/ejde_main.py
@@ -12,8 +12,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 '''
     Target site: 'ejde.math.txstate.edu'
 
-    Total number of papers: 2023/08/08 - 4300
-    Total Time via VPN w/100ms-delay: 254.04s
+    Total number of papers: 2023/08/08 - 4339
+    Total Time via VPN w/52ms-delay: 430.38s
 
     ========== Execution order ==========
     1. ejde_main fetches each year's volume links -> scrapes each paper's metadata and author info -> calls ejde_save -> stores the results in small temporary JSON files
@@ -68,15 +68,17 @@ def process_volume(url):
     baseWeb = url[:url.rfind('/')] + "/"
     html = volume_response.text
     volume_soup = BeautifulSoup(html, "html.parser")
-    ol_elements = volume_soup.find_all('ol')
+    li_elements = volume_soup.find_all('ol')
+    if not li_elements:
+        li_elements = volume_soup.find_all('ul')
 
-    for ol in ol_elements:
-        em_elements = ol.find_all('em')
+    for li in li_elements:
+        em_elements = li.find_all('em')
         if em_elements:
             articles.extend(em for em in em_elements)
         # Another html style
         else:
-            i_elements = ol.find_all('i')
+            i_elements = li.find_all('i')
             if i_elements:
                 articles.extend(i for i in i_elements)
             else:
@@ -157,21 +159,26 @@ def process_article(title, article_url):
     title_match = re.search(r"(.*?)", article_text)
     title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else None
 
+    # Extract issue
+    issue_match = re.search(r'No\. (\d+)', article_text)
+    issue = issue_match.group(1) if issue_match else None
+
     # Extract volume
-    volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
+    volume_match = re.search(r'Vol\. (\d+)', article_text)
     volume = str(volume_match.group(1)) if volume_match else None
     if not volume:
-        volume_match = re.search(r'Vol\. (\d+)', article_text)
-        volume = str(volume_match.group(1)) if volume_match else None
+        volume_match = re.search(r'Special Issue (\d+) \((\d+)\)', article_text)
+        if volume_match:
+            issue_number, volume = volume_match.groups()
+            volume = str(volume)
+            issue = "Special Issue " + str(issue_number)
+        else:
+            volume = None
 
     # Extract pp
     pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
     pp = pp_match.group(1) if pp_match else None
 
-    # Extract issue
-    issue_match = re.search(r'No\. (\d+)', article_text)
-    issue = issue_match.group(1) if issue_match else None
-
     # Extract submission date
     match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
     submitted_date = match.group(1) if match else None
@@ -207,7 +214,7 @@ def process_article(title, article_url):
         keywords = []
 
     # Extract DOI
-    doi_match = re.search(r'DOI: (.+)(?=<)', html)
+    doi_match = re.search(r'DOI: ([^\t\n<]+)', html)
     if not doi_match:
         doi_match = re.search(r'DOI: (.+)', html)
     doi = doi_match.group(1) if doi_match else None
@@ -222,18 +229,33 @@ def process_article(title, article_url):
         for row in table.find_all('tr'):
             cells = [cell.text.strip() for cell in row.find_all('td')]
             for cell in cells:
+                if "email:" in cell:
+                    cell = cell.split("email:")
+                    email_list = str(cell[1]).split(',')
+                    cell = cell[0]
+                elif "e-mail:" in cell:
+                    cell = cell.split("e-mail:")
+                    email_list = str(cell[1]).split(',')
+                    cell = cell[0]
+                else:
+                    email_list = None
+
                 cell = re.split(r'[\r\n]+', cell)
-                cell = [c.replace('email: ', '').replace('\\newline', '') for c in cell]
+                cell = [c.replace('\\newline', '') for c in cell]
                 cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]
 
                 # Data processing
                 if cell[0]:
                     authors.append(cell[0])
                     name = re.split(r'[ .]', cell[0])
-                    affiliation = ', '.join(cell[1:-1])
-                    affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
-                    email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1])
-                    email = email_match.group() if email_match else None
+                    affiliation = ', '.join(cell[1:]).lstrip(",").rstrip(",").strip()
+                    affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
+                    affiliation = affiliation.lstrip(",").rstrip(",").strip()
+                    emails = []
+                    if email_list:
+                        for email in email_list:
+                            email_match = re.search(r'[\w.-]+@[\w.-]+', email)
+                            emails.append(email_match.group()) if email_match else None
 
                     author_data = {
                         "author_id": str(uuid.uuid4()),
@@ -244,7 +266,7 @@ def process_article(title, article_url):
                         "affiliation": [{
                             "year": volume,
                             "affiliation": affiliation,
-                            "email": email
+                            "email": emails
                         }]
                     }
                     authorData.append(author_data)
@@ -267,23 +289,38 @@ def process_article(title, article_url):
         matches = matches.replace('["', '').replace('"]', '').replace('[\'', '').replace('\']', '')
         matches = matches.split("")
         for match in matches:
+            if "email:" in match:
+                match = match.split("email:")
+                email_list = str(match[1]).split(',')
+                match = match[0]
+            elif "e-mail:" in match:
+                match = match.split("e-mail:")
+                email_list = str(match[1]).split(',')
+                match = match[0]
+            else:
+                email_list = None
+
             match = re.sub(r'<[^>]+>', '', match)
             match = match.lstrip("\\n ").lstrip("\n ").rstrip("\\n ").rstrip("\n ").strip()
             if match_type == 0:
                 match = match.split("\\n")
            else:
                 match = match.split("\n")
-            match = [m.replace('email: ', '').replace('\\newline', '') for m in match]
+            match = [m.replace('\\newline', '') for m in match]
             match = [re.sub(r'\s+', ' ', m).strip() for m in match]
 
             # Data processing
             if match[0]:
                 authors.append(match[0])
                 name = re.split(r'[ .]', match[0])
-                affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip()
-                affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
-                email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1])
-                email = email_match.group() if email_match else None
+                affiliation = ''.join(match[1:]).lstrip(",").rstrip(",").strip()
+                affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
+                affiliation = affiliation.lstrip(",").rstrip(",").strip()
+                emails = []
+                if email_list:
+                    for email in email_list:
+                        email_match = re.search(r'[\w.-]+@[\w.-]+', email)
+                        emails.append(email_match.group()) if email_match else None
 
                 author_data = {
                     "author_id": str(uuid.uuid4()),
@@ -294,7 +331,7 @@ def process_article(title, article_url):
                     "affiliation": [{
                         "year": volume,
                         "affiliation": affiliation,
-                        "email": email
+                        "email": emails
                     }]
                 }
                 authorData.append(author_data)
@@ -339,14 +376,21 @@ def process_article(title, article_url):
 
 start_time = time.time()
 
+url_list = []
+
+# Get all general volumes url
 index = "https://ejde.math.txstate.edu/indexleft.html"
 response = requests.get(index)
 soup = BeautifulSoup(response.content, 'html.parser')
-
-# Find all the URL links under the first Volume section
 volume_links = soup.select('font > a[href]')
-# Extract and store the URLs in a list using list comprehension
-url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1]
+url_list.extend(["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1])
+
+# Get all special issues url
+index = "https://ejde.math.txstate.edu/special-toc.html"
+response = requests.get(index)
+soup = BeautifulSoup(response.content, 'html.parser')
+special_links = soup.find_all("a", href=True)
+url_list.extend(["https://ejde.math.txstate.edu/" + tag["href"] for tag in special_links[:-1]])
 
 # Initialize lists
 authorData = []
@@ -361,7 +405,7 @@ authorNum = 0
 articleNum = 0
 batch_size = 100 # Number of articles to process before saving
 
-executor = ThreadPoolExecutor(max_workers=25) # Set the number of worker threads
+executor = ThreadPoolExecutor(max_workers=len(url_list)) # Set the number of worker threads
 
 # Process each URL using multithreading
 futures = [executor.submit(process_volume, url) for url in url_list]
@@ -393,12 +437,12 @@ while failedData:
 if len(articleData) > 0:
     ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
     print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
-    print("Total fetched paper:", len(articleData) + articleNum)
+print("Total fetched paper:", len(articleData) + articleNum)
 
 if len(authorData) > 0:
     ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
     print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
-    print("Total fetched author:", len(authorData) + authorNum)
+print("Total fetched author:", len(authorData) + authorNum)
 
 # Save error record
 if len(totallyFailedData) > 0:
@@ -418,4 +462,4 @@ print("time elapsed: {:.2f}s".format(time.time() - start_time))
 
 # Transfer to large file and delete the temporary storage files
 ejde_save.Transf()
-ejde_save.delete()
\ No newline at end of file
+ejde_save.delete()
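
Note on the process_volume change: the old code only looked for <ol> lists on a volume's table-of-contents page; the patch falls back to <ul> when no <ol> is found, keeping the existing <em>/<i> title handling. A minimal sketch of that detection logic, run against assumed sample HTML rather than a real EJDE page:

from bs4 import BeautifulSoup

# Assumed sample markup: some volume pages list papers in <ol>, others in <ul>,
# and paper titles appear either in <em> or <i>.
html = """
<ul>
  <li><em>A sample paper title</em> <a href="abstr.html">abstract</a></li>
</ul>
"""
soup = BeautifulSoup(html, "html.parser")

li_elements = soup.find_all('ol')
if not li_elements:                       # no <ol> on this page ...
    li_elements = soup.find_all('ul')     # ... fall back to <ul>

articles = []
for li in li_elements:
    em_elements = li.find_all('em')
    if em_elements:
        articles.extend(em_elements)
    else:                                 # the other html style uses <i>
        articles.extend(li.find_all('i'))

print([a.get_text() for a in articles])   # ['A sample paper title']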
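
Note on the volume/issue change: issue extraction now runs before volume extraction, and when the regular 'Vol. N' pattern is absent the code falls back to the 'Special Issue N (YYYY)' form used by the special-issue pages, overwriting the issue field accordingly. A small sketch of that fallback; the sample citation strings below are assumptions for illustration, not copied from the site:

import re

def parse_volume_issue(article_text):
    # Sketch of the patched fallback: regular issues carry 'Vol. N',
    # special issues carry 'Special Issue N (YYYY)'.
    issue_match = re.search(r'No\. (\d+)', article_text)
    issue = issue_match.group(1) if issue_match else None

    volume_match = re.search(r'Vol\. (\d+)', article_text)
    if volume_match:
        volume = volume_match.group(1)
    else:
        special = re.search(r'Special Issue (\d+) \((\d+)\)', article_text)
        if special:
            issue_number, volume = special.groups()
            issue = "Special Issue " + issue_number
        else:
            volume = None
    return volume, issue

# Assumed citation strings, for illustration only
print(parse_volume_issue("Vol. 2023 (2023), No. 52, pp. 1-18."))
# -> ('2023', '52')
print(parse_volume_issue("Special Issue 01 (2021), pp. 129-140."))
# -> ('2021', 'Special Issue 01')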
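
Note on the author-parsing change: instead of stripping a literal 'email: ' prefix and reading a single address from the last line of a cell, the patch splits each table cell (and each regex match in the second branch) on the 'email:' / 'e-mail:' marker and collects every address after it, which is why the author record's 'email' field becomes a list. A simplified standalone sketch, with the helper name and sample cell text assumed:

import re

def split_cell(cell):
    # Separate the address block from the 'email:' / 'e-mail:' tail,
    # then collect every address found after the marker (sketch only).
    email_list = None
    for marker in ("email:", "e-mail:"):
        if marker in cell:
            cell, tail = cell.split(marker, 1)
            email_list = tail.split(',')
            break

    lines = [re.sub(r'\s+', ' ', c).strip() for c in re.split(r'[\r\n]+', cell)]
    author = lines[0]
    affiliation = ', '.join(lines[1:]).strip(', ')

    emails = []
    if email_list:
        for item in email_list:
            m = re.search(r'[\w.-]+@[\w.-]+', item)
            if m:
                emails.append(m.group())
    return author, affiliation, emails

# Hypothetical cell text in the shape the spider expects from a <td> element
sample = ("Jane Doe\nDepartment of Mathematics\n"
          "Example University, City, Country\n"
          "email: jane.doe@example.edu, j.doe@example.org")
print(split_cell(sample))
# -> ('Jane Doe', 'Department of Mathematics, Example University, City, Country',
#     ['jane.doe@example.edu', 'j.doe@example.org'])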