import time
import uuid
import requests
import re
import ejde_save
from retrying import retry
from datetime import datetime
from bs4 import BeautifulSoup
from unidecode import unidecode
from concurrent.futures import ThreadPoolExecutor, as_completed

'''
Target site: 'ejde.math.txstate.edu'
Total number of papers: 2023/08/08 - 4761
Total time via VPN w/100ms delay: 306.73s

========== Run order ==========
1. ejde_main       Collect the volume links for each year -> crawl each article's metadata
                   and author information -> call ejde_save -> stage the results in small
                   JSON files.
2. ejde_save       Read the staged small files locally, filter them, and merge them into
                   large per-year files.
*3. ejde_save.delete() (optional)   Delete every file in the staging area (back it up first).
'''


def datetime_transform(date):
    # Month-name typos observed on the site, mapped to their corrections
    month_typo = {
        "Janaury": "January", "Febrary": "February", "Februay": "February",
        "Mar": "March", "Mach": "March", "Match": "March", "Maay": "May",
        "Jun": "June", "Juy": "July", "Aapril": "April",
        "Spetember": "September", "Septembere": "September",
        "Ocotber": "October", "Nobember": "November",
    }
    try:
        input_date = datetime.strptime(date, "%B %d, %Y")
        return input_date.strftime("%Y-%m-%d")
    except ValueError:
        # Fix the typos and try once more; fall back to the raw string if parsing still fails
        for typo, correction in month_typo.items():
            date = date.replace(typo, correction)
        try:
            input_date = datetime.strptime(date, "%B %d, %Y")
            return input_date.strftime("%Y-%m-%d")
        except ValueError as val_err:
            print("TYPO:", str(val_err))
            return date


# Article and author detail
def process_volume(url):
    articles = []
    baseWeb = None
    retries = 5
    for attempt in range(retries):
        try:
            volume_response = requests.get(url)
            if volume_response.status_code == 200:
                volume_response.raise_for_status()
                baseWeb = url[:url.rfind('/')] + "/"
                html = volume_response.text
                volume_soup = BeautifulSoup(html, "html.parser")
                li_elements = volume_soup.find_all('ol')
                if not li_elements:
                    li_elements = volume_soup.find_all('ul')
                for li in li_elements:
                    em_elements = li.find_all('em')
                    if em_elements:
                        articles.extend(em_elements)
                    # Another html style
                    else:
                        i_elements = li.find_all('i')
                        if i_elements:
                            articles.extend(i_elements)
                        else:
                            print("HTML FORMAT FAILURE:", url)
                            fail = {"website": url}
                            failedFormatData.append(fail)
                            return
                break
        except Exception as fetch_err:
            if attempt < retries - 1:
                print("RETRYING TO FETCH HTML:", str(fetch_err))
                time.sleep(1)
                continue
            else:
                print("HTML FETCHING FAILURE:", url)
                fail = {"website": url}
                failedVolData.append(fail)
                return

    # Process each article using multithreading (>20 threads would cause more errors)
    volume_executor = ThreadPoolExecutor(max_workers=15)
    volume_futures = [volume_executor.submit(process_html_article, baseWeb, article)
                      for article in articles]
    # Wait for all tasks to complete
    for volume_future in as_completed(volume_futures):
        try:
            volume_future.result()
        except Exception as html_err:
            print("HTML PROCESSING ERROR:", str(html_err))


def process_html_article(baseweb, article):
    # Get article title & url
    try:
        title = article.text.strip()
        title = re.sub(r'\s+', ' ', title).strip()
        article_url = baseweb + article.find_next("a")["href"]
        if "../../index.html" in article_url:
            print("Redundant URL:", article_url)
            return
    except Exception as html_format_err:
        print("HTML FORMAT FAILURE:", str(html_format_err))
        fail = {"article": str(article)}
        failedFormatData.append(fail)
        return

    # Crawl article data
    try:
        process_article(title, article_url)
    except Exception as article_err:
        print("ARTICLE PROCESSING FAILURE:", str(article_err))
        fail = {"title": title, "URL": article_url}
        failedData.append(fail)
        return


@retry(wait_fixed=5000, stop_max_attempt_number=5)
def process_article(title, article_url):
    global articleNum, authorNum
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
    article_response = requests.get(article_url, headers=headers)
    if article_response.status_code == 200:
        article_response.raise_for_status()
        html = article_response.text
        article_soup = BeautifulSoup(html, 'html.parser')
        article_text = article_soup.get_text()

        # Extract title if it was not recovered from the table of contents.
        # NOTE: the original fallback pattern was garbled in the source file; the pattern
        # below is an assumed reconstruction that grabs the line framed by blank lines in
        # the page text.
        if not title:
            title_match = re.search(r"\n\n(.*?)\n\n", article_text)
            title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else None

        # Extract issue
        issue_match = re.search(r'No\. (\d+)', article_text)
        issue = issue_match.group(1) if issue_match else None

        # Extract volume (regular volumes, special issues, and conference issues)
        volume_match = re.search(r'Vol\. (\d+)', article_text)
        volume = str(volume_match.group(1)) if volume_match else None
        if not volume:
            volume_match = re.search(r'Special Issue (\d+) \((\d+)\)', article_text)
            if volume_match:
                issue_number, volume = volume_match.groups()
                volume = str(volume)
                issue = "Special Issue " + str(issue_number)
            else:
                volume_match = re.search(r'Conf\. (\d{2}), (\d{4})', article_text)
                if volume_match:
                    issue = "Conference " + str(volume_match.group(1))
                    volume = str(volume_match.group(2))
                else:
                    volume_match = re.search(r'Conference (\d+) \((\d+)\)', article_text)
                    if volume_match:
                        issue_number, volume = volume_match.groups()
                        volume = str(volume)
                        issue = "Conference " + str(issue_number)
                    else:
                        volume = None

        # Extract pp
        pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
        pp = pp_match.group(1) if pp_match else None

        # Extract submission date
        match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
        submitted_date = match.group(1) if match else None
        if submitted_date:
            submitted_date = datetime_transform(submitted_date)

        # Extract publication date
        match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
        publish_date = match.group(1) if match else None
        if publish_date:
            publish_date = datetime_transform(publish_date)

        # Extract MSC
        msc_match = re.search(r'Math Subject Classifications: (.*?)\n', html)
        if not msc_match:
            msc_match = re.search(r'Math Subject Classifications:(.*?)\.', html)
        if msc_match:
            msc = re.sub(r'<[^>]+>', '', msc_match.group(1).strip())
            msc = msc.strip('.').strip()
            msc = re.split(r', |;', msc)
        else:
            msc = []
        # Extract keywords.
        # NOTE: the tag alternatives inside the original pattern were garbled in the
        # source file; '<br>' and '<p>' are assumed terminators.
        keywords_match = re.search(r'Key Words: (.*?)(?:<br>|<p>|$)', html, re.DOTALL)
        if not keywords_match:
            keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
        if keywords_match:
            keywords = re.sub(r'<[^>]+>', '', keywords_match.group(1).strip()).replace('\n', '')
            keywords = re.split(r', |;', keywords)
            keywords = [unidecode(re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip())
                        for keyword in keywords if len(keyword.strip())]
        else:
            keywords = []

        # Extract DOI
        doi_match = re.search(r'DOI: ([^\t\n<]+)', html)
        if not doi_match:
            doi_match = re.search(r'DOI: (.+)', html)
        doi = doi_match.group(1) if doi_match else None
        if doi:
            doi = doi.replace('https://doi.org/', '')  # strip doi website header

        # Article_id
        article_id = str(uuid.uuid4())

        # Author info
        authors = []
        table = article_soup.find('table')
        if table:
            for row in table.find_all('tr'):
                cells = [cell.text.strip() for cell in row.find_all('td')]
                for cell in cells:
                    # Split off the email address block, if any
                    if "email:" in cell:
                        cell = cell.split("email:")
                        email_list = str(cell[1]).split(',')
                        cell = cell[0]
                    elif "e-mail:" in cell:
                        cell = cell.split("e-mail:")
                        email_list = str(cell[1]).split(',')
                        cell = cell[0]
                    else:
                        email_list = None
                    cell = re.split(r'[\r\n]+', cell)
                    cell = [c.replace('\\newline', '') for c in cell]
                    cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]

                    # Data processing
                    if cell[0]:
                        authors.append(unidecode(cell[0]))
                        name = re.split(r'[ .]', cell[0])
                        affiliation = ', '.join(cell[1:]).lstrip(",").rstrip(",").strip()
                        affiliation = re.sub(r'^[^a-zA-Z]*', '',
                                             re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
                        affiliation = affiliation.lstrip(",").rstrip(",").strip()
                        emails = []
                        if email_list:
                            for email in email_list:
                                email_match = re.search(r'[\w.-]+@[\w.-]+', email)
                                if email_match:
                                    emails.append(unidecode(email_match.group()))
                        author_data = {
                            "author_id": str(uuid.uuid4()),
                            "from_article": [article_id],
                            "first_name": unidecode(name[0]),
                            "last_name": unidecode(name[-1]),
                            "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None,
                            "affiliation": [{
                                "year": volume,
                                "affiliation": unidecode(affiliation),
                                "email": emails
                            }]
                        }
                        authorData.append(author_data)
                        authorNum += 1

        # If no author table
        else:
            match_type = 0
            hr_count = len(article_soup.find_all('hr'))
            # NOTE: the author-block patterns below were garbled in the source file;
            # the tags used here are assumed reconstructions.
            if hr_count < 3:
                pattern = r'<hr>\s*<p>\s*(.*?)</p>'
            else:
                pattern = r'<hr>(?:.*<hr>)(.*)(?=<hr>)'
            matches = str(re.findall(pattern, html, re.DOTALL))
            if len(matches) < 5:
                # Fall back to the last <p> block on the page
                match_type = 1
                last_p_tag = str(article_soup.find_all('p')[-1])
                pattern = r'<p>\s*(.*?)\s*</p>'
                matches = re.search(pattern, str(last_p_tag), re.DOTALL).group(1).strip()

            if matches:
                # Strip the list brackets left over from str(re.findall(...))
                matches = matches.replace('["', '').replace('"]', '').replace('[\'', '').replace('\']', '')
                # Author paragraphs are assumed to be separated by <p> tags
                # (the original delimiter was garbled in the source file)
                matches = matches.split("<p>")
                for match in matches:
                    # Split off the email address block, if any
                    if "email:" in match:
                        match = match.split("email:")
                        email_list = str(match[1]).split(',')
                        match = match[0]
                    elif "e-mail:" in match:
                        match = match.split("e-mail:")
                        email_list = str(match[1]).split(',')
                        match = match[0]
                    else:
                        email_list = None
                    match = re.sub(r'<[^>]+>', '', match)
                    match = match.lstrip("\\n ").lstrip("\n ").rstrip("\\n ").rstrip("\n ").strip()
                    # A stringified findall() result escapes newlines, a search() result does not
                    if match_type == 0:
                        match = match.split("\\n")
                    else:
                        match = match.split("\n")
                    match = [m.replace('\\newline', '') for m in match]
                    match = [re.sub(r'\s+', ' ', m).strip() for m in match]

                    # Data processing
                    if match[0]:
                        authors.append(unidecode(match[0]))
                        name = re.split(r'[ .]', match[0])
                        affiliation = ''.join(match[1:]).lstrip(",").rstrip(",").strip()
                        affiliation = re.sub(r'^[^a-zA-Z]*', '',
                                             re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
                        affiliation = affiliation.lstrip(",").rstrip(",").strip()
                        emails = []
                        if email_list:
                            for email in email_list:
                                email_match = re.search(r'[\w.-]+@[\w.-]+', email)
                                if email_match:
                                    emails.append(unidecode(email_match.group()))
                        author_data = {
                            "author_id": str(uuid.uuid4()),
                            "from_article": [article_id],
                            "first_name": unidecode(name[0]),
                            "last_name": unidecode(name[-1]),
                            "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None,
                            "affiliation": [{
                                "year": volume,
                                "affiliation": unidecode(affiliation),
                                "email": emails
                            }]
                        }
                        authorData.append(author_data)
                        authorNum += 1
            else:
                print("AUTHOR SEARCHING ERROR:", article_url)
                fail = {"title": title, "URL": article_url}
                failedFormatData.append(fail)

        # Article info
        article_data = {
            "article_id": article_id,
            "title": unidecode(title),
            "authors": authors,
            "corresponding_authors": None,
            "submit_datetime": submitted_date,
            "publish_datetime": publish_date,
            "keywords": keywords,
            "MSC": msc,
            "URL": article_url,
            "DOI": doi,
            "publisher": "Texas State University",
            "journal": "Electronic Journal of Differential Equations",
            "volume": volume,
            "issue": issue,
            "page": pp
        }
        articleData.append(article_data)
        articleNum += 1

        # Save the data periodically based on batch size
        if len(articleData) % batch_size == 0:
            ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
            articleData.clear()
        if len(authorData) % batch_size == 0:
            ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
            authorData.clear()


start_time = time.time()
url_list = []

# Get all general volume urls
index = "https://ejde.math.txstate.edu/indexleft.html"
response = requests.get(index)
soup = BeautifulSoup(response.content, 'html.parser')
volume_links = soup.select('font > a[href]')
url_list.extend(["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1])

# Get all special issue urls
index = "https://ejde.math.txstate.edu/special-toc.html"
response = requests.get(index)
soup = BeautifulSoup(response.content, 'html.parser')
special_links = soup.find_all("a", href=True)
url_list.extend(["https://ejde.math.txstate.edu/" + tag["href"] for tag in special_links[:-1]])

# Get all conference special issue urls
index = "https://ejde.math.txstate.edu/conf-toc.html#latest"
response = requests.get(index)
soup = BeautifulSoup(response.content, 'html.parser')
special_links = soup.find_all("a", href=True)
url_list.extend(["https://ejde.math.txstate.edu/" + tag["href"] for tag in special_links[:-2]])

authorData = []
articleData = []
failedData = []
totallyFailedData = []
failedVolData = []
failedFormatData = []
authorNum = 0
articleNum = 0
batch_size = 100  # Number of articles to process before saving
# Set the number of worker threads
executor = ThreadPoolExecutor(max_workers=int(len(url_list) / 2))

# Process each URL using multithreading
futures = [executor.submit(process_volume, url) for url in url_list]

# Wait for all tasks to complete
for future in as_completed(futures):
    try:
        future.result()
    except Exception as vol_err:
        print("VOLUME PROCESSING ERROR:", str(vol_err))

# Retry failed processing paper
print("START RETRYING:", len(failedData))
while failedData:
    data = failedData.pop(0)
    articleTitle = data["title"]
    articleUrl = data["URL"]
    try:
        process_article(articleTitle, articleUrl)
    except Exception as retry_err:
        print("ARTICLE RETRYING FAILURE:", str(retry_err))
        totally_fail = {"title": articleTitle, "URL": articleUrl}
        totallyFailedData.append(totally_fail)

# Save remaining data
if len(articleData) > 0:
    ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
    print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
if len(authorData) > 0:
    ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
    print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")

# Save error records
if len(totallyFailedData) > 0:
    ejde_save.save_data(totallyFailedData, "Error", "Failed_article_record.json")
    print("Total failed processing paper:", len(totallyFailedData))
if len(failedVolData) > 0:
    ejde_save.save_data(failedVolData, "Error", "Failed_volume_record.json")
    print("Total failed fetching volume:", len(failedVolData))
if len(failedFormatData) > 0:
    ejde_save.save_data(failedFormatData, "Error", "Failed_format_record.json")
    print("Total failed searching article:", len(failedFormatData))

# Statistics
print("Total fetched paper:", articleNum)
print("Total fetched author:", authorNum)
print("time elapsed: {:.2f}s".format(time.time() - start_time))

# Transfer to large files and delete the temporary storage files
ejde_save.Transf()
ejde_save.delete()
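
# For reference, each staged article record written to ./ejde_buffer/Article_TS/ has the
# shape sketched below. Field names match the article_data dict built in process_article();
# the values are illustrative placeholders only, not taken from a real crawl.
#
# {
#     "article_id": "<uuid4>",
#     "title": "Example article title",
#     "authors": ["Jane Doe", "John Roe"],
#     "corresponding_authors": None,
#     "submit_datetime": "2023-01-15",
#     "publish_datetime": "2023-03-02",
#     "keywords": ["nonlinear elliptic equation", "weak solution"],
#     "MSC": ["35J60"],
#     "URL": "<abstract page url>",
#     "DOI": "<doi string with the https://doi.org/ prefix stripped>",
#     "publisher": "Texas State University",
#     "journal": "Electronic Journal of Differential Equations",
#     "volume": "2023",
#     "issue": "18",
#     "page": "1-20"
# }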