diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py
index 4f8cf0e..6f91237 100644
--- a/01_EJDE_spider/ejde_main.py
+++ b/01_EJDE_spider/ejde_main.py
@@ -1,8 +1,10 @@
+import time
 import uuid
 import requests
 import re
 import ejde_save
+from retrying import retry
 from datetime import datetime
 from bs4 import BeautifulSoup
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
@@ -10,6 +12,9 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 
 '''
     Website crawled: 'ejde.math.txstate.edu'
+    Total number of papers:
+        2023/08/08 - 4300
+
     ========== Execution order ==========
     1. ejde_main: fetch the journal links for each year -> crawl each paper's article and author information -> call ejde_save -> store temporarily in small JSON files
    2. ejde_save: read the temporary small files locally, filter them, and merge them into large files split by year
@@ -18,42 +23,141 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 
 
 def datetime_transform(date):
+    month_typo = {
+        "Janaury": "January",
+        "Febrary": "February",
+        "Februay": "February",
+        "Mar": "March",
+        "Mach": "March",
+        "Match": "March",
+        "Maay": "May",
+        "Jun": "June",
+        "Juy": "July",
+        "Aapril": "April",
+        "Spetember": "September",
+        "Septembere": "September",
+        "Ocotber": "October",
+    }
     try:
         input_date = datetime.strptime(date, "%B %d, %Y")
         return input_date.strftime("%Y-%m-%d")
-    # handle two month-typos
-    except ValueError as vale:
-        if "Match 7, 2012" in date:
-            return "2012-03-07"
-        elif "Janaury 15, 2021" in date:
-            return "2021-01-15"
-        else:
-            print("Month typo:", str(vale))
+    except ValueError:
+        for typo, correction in month_typo.items():
+            date = date.replace(typo, correction)
+
+        try:
+            input_date = datetime.strptime(date, "%B %d, %Y")
+            return input_date.strftime("%Y-%m-%d")
+        except ValueError as val_err:
+            print("TYPO:", str(val_err))
             return date
 
 
 # Article and author detail
-def process_article(url):
-    response = requests.get(url)
-    response.raise_for_status()
+def process_volume(url):
+    articles = []
+    baseWeb = None
 
-    baseWeb = url[:url.rfind('/')] + "/"
-    html = response.text
-    soup = BeautifulSoup(html, "html.parser")
+    retries = 5
+    for attempt in range(retries):
+        try:
+            volume_response = requests.get(url)
+            if volume_response.status_code == 200:
+                volume_response.raise_for_status()
 
-    articles = soup.find_all("li")
+                baseWeb = url[:url.rfind('/')] + "/"
+                html = volume_response.text
+                volume_soup = BeautifulSoup(html, "html.parser")
+                ol_elements = volume_soup.find_all('ol')
 
-    for article in articles:
-        authors = article.find("strong").text.strip().split(", ")
-        title = article.find("em").text.strip()
-        article_url = baseWeb + article.find("a")["href"]
+                for ol in ol_elements:
+                    em_elements = ol.find_all('em')
+                    if em_elements:
+                        articles.extend(em for em in em_elements)
+                    # Another HTML style
+                    else:
+                        i_elements = ol.find_all('i')
+                        if i_elements:
+                            articles.extend(i for i in i_elements)
+                        else:
+                            print("HTML FORMAT FAILURE:", url)
+                            return
+            break
+        except Exception as fetch_err:
+            if attempt < retries - 1:
+                print("RETRYING TO FETCH HTML:", str(fetch_err))
+                time.sleep(1)
+                continue
+            else:
+                print("HTML FETCHING FAILURE:", url)
+                fail = {
+                    "website": url,
+                }
+                failedVolData.append(fail)
+                return
 
-        # Access article detail page
-        response = requests.get(article_url)
-        html = response.text
-        soup = BeautifulSoup(html, 'html.parser')
+    # Process each article using multithreading
+    volume_executor = ThreadPoolExecutor(max_workers=15)
+    volume_futures = [volume_executor.submit(process_html_article, baseWeb, article) for article in articles]
 
-        article_text = soup.get_text()
+    # Wait for all tasks to complete
+    for volume_future in as_completed(volume_futures):
+        try:
+            volume_future.result()
+        except Exception as html_err:
+            print("HTML PROCESSING ERROR:", str(html_err))
+
+
+def process_html_article(baseweb, article):
+    global articleNum, authorNum
+    # Get article title & url
+    try:
+        title = article.text.strip()
+        title = re.sub(r'\s+', ' ', title).strip()
+        article_url = baseweb + article.find_next("a")["href"]
+    except Exception as html_format_err:
+        print("HTML FORMAT FAILURE:", str(html_format_err))
+        fail = {
+            "article": str(article),
+        }
+        failedFormatData.append(fail)
+        return
+
+    # Crawl article data
+    try:
+        process_article(title, article_url)
+    except Exception as article_err:
+        print("ARTICLE PROCESSING FAILURE:", str(article_err))
+        fail = {
+            "title": title,
+            "URL": article_url,
+        }
+        failedData.append(fail)
+        return
+
+    # Save the data periodically based on batch size
+    if len(articleData) % batch_size == 0:
+        ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
+        articleNum += len(articleData)
+        articleData.clear()
+
+    if len(authorData) % batch_size == 0:
+        ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
+        authorNum += len(authorData)
+        authorData.clear()
+
+
+@retry(wait_fixed=5000, stop_max_attempt_number=5)
+def process_article(title, article_url):
+    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
+                             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
+    article_response = requests.get(article_url, headers=headers)
+    if article_response.status_code == 200:
+        article_response.raise_for_status()
+
+        html = article_response.text
+        article_soup = BeautifulSoup(html, 'html.parser')
+        article_text = article_soup.get_text()
 
         # Extract volume
         volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
@@ -69,26 +173,22 @@ def process_article(url):
 
         # Extract submission date
         match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
-        submitted_date = match.group(1)
-        if match:
+        submitted_date = match.group(1) if match else None
+        if submitted_date:
             submitted_date = datetime_transform(submitted_date)
-        else:
-            submitted_date = None
 
         # Extract publication date
         match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
-        publish_date = match.group(1)
-        if match:
+        publish_date = match.group(1) if match else None
+        if publish_date:
             publish_date = datetime_transform(publish_date)
-        else:
-            publish_date = None
 
         # Extract MSC
         msc_match = re.search(r'Math Subject Classifications: (.*?)\n', html)
         if not msc_match:
             msc_match = re.search(r'Math Subject Classifications:(.*?)\.', html)
         if msc_match:
-            msc = msc_match.group(1).strip().strip('.')
+            msc = msc_match.group(1).strip().strip('.').strip()
             msc = re.split(r', |;', msc)
         else:
             msc = None
@@ -100,7 +200,7 @@ def process_article(url):
         if keywords_match:
             keywords = keywords_match.group(1).strip().replace('\n', '')
             keywords = re.split(r', |;', keywords)
-            keywords = [keyword.strip().strip('.') for keyword in keywords]
+            keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords]
         else:
             keywords = None
@@ -113,6 +213,79 @@ def process_article(url):
         # Article_id
         article_id = str(uuid.uuid4())
 
+        # Author info
+        authors = []
+        table = article_soup.find('table')
+        if table:
+            for row in table.find_all('tr'):
+                cells = [cell.text.strip() for cell in row.find_all('td')]
+                for cell in cells:
+                    cell = cell.split("\n")
+                    cell = [element.replace('email: ', '') for element in cell]
+                    cell = [c.strip() for c in cell]
+
+                    # Data processing
+                    authors.append(cell[0])
+                    name = cell[0].split(" ")
+                    affiliation = ', '.join(cell[1:-1])
+                    affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
+                    email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1])
+                    email = email_match.group() if email_match else None
+
+                    author_data = {
+                        "author_id": str(uuid.uuid4()),
+                        "from_article": article_id,
+                        "firstname": name[0],
+                        "lastname": name[-1],
+                        "middlename": name[1:len(name) - 1] if len(name) > 2 else None,
+                        "affiliation": [{
+                            "year": volume,
+                            "affiliation": affiliation,
+                            "email": email,
+                        }]
+                    }
+                    authorData.append(author_data)
+        # If no author table
+        else:
+            pattern = r'
") + + for match in matches: + match = re.sub(r'<[^>]+>', '', match) + match = match.lstrip("\\n ").rstrip("\\n ").strip() + match = match.split("\\n") + match = [element.replace('email: ', '') for element in match] + match = [m.strip() for m in match] + + # Data processing + authors.append(match[0]) + name = match[0].split(" ") + affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip() + affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip() + email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1]) + email = email_match.group() if email_match else None + + author_data = { + "author_id": str(uuid.uuid4()), + "from_article": article_id, + "firstname": name[0], + "lastname": name[-1], + "middlename": name[1:len(name) - 1] if len(name) > 2 else None, + "affiliation": [{ + "year": volume, + "affiliation": affiliation, + "email": email, + }] + } + authorData.append(author_data) + else: + print("AUTHOR SEARCHING ERROR:", article_url) + return + + # Article info article_data = { "article_id": article_id, "title": title, @@ -132,79 +305,79 @@ def process_article(url): } articleData.append(article_data) - # Author info - table = soup.find('table') - for row in table.find_all('tr'): - cells = [cell.text.strip() for cell in row.find_all('td')] - for cell in cells: - cell = cell.split("\n") - cell = [element.replace('email: ', '') for element in cell] - cell = [c.strip() for c in cell] - - # Data processing - name = cell[0].split(" ") - affiliation = ', '.join(cell[1:-1]) - email = cell[-1] - - author_data = { - "author_id": str(uuid.uuid4()), - "from_article": article_id, - "firstname": name[0], - "lastname": name[-1], - "middlename": name[1:len(name) - 1] if len(name) > 2 else None, - "affiliation": [{ - "year": volume, - "affiliation": affiliation, - "email": email, - }] - } - authorData.append(author_data) - - # Save the data periodically based on batch size - if len(articleData) % batch_size == 0: - ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json") - articleData.clear() - - if len(authorData) % batch_size == 0: - ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json") - authorData.clear() - index = "https://ejde.math.txstate.edu/indexleft.html" response = requests.get(index) soup = BeautifulSoup(response.content, 'html.parser') -# Find all the URL links under the first (Volumes) section +# Find all the URL links under the first Volume section volume_links = soup.select('font > a[href]') - # Extract and store the URLs in a list using list comprehension url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1] +# Initialize lists authorData = [] articleData = [] +failedData = [] +totallyFailedData = [] +failedVolData = [] +failedFormatData = [] + +# Initialize variables for counting +authorNum = 0 +articleNum = 0 batch_size = 100 # Number of articles to process before saving executor = ThreadPoolExecutor(max_workers=25) # Set the number of worker threads # Process each URL using multithreading -futures = [executor.submit(process_article, url) for url in url_list] +futures = [executor.submit(process_volume, url) for url in url_list] # Wait for all tasks to complete for future in as_completed(futures): try: future.result() - except Exception as e: - print("An error occurred:", str(e)) + except Exception as vol_err: + print("VOLUME PROCESSING ERROR:", str(vol_err)) + +# Retry failed processing paper +for data in failedData: + articleTitle = data["title"] + 
+    articleUrl = data["URL"]
+    try:
+        process_article(articleTitle, articleUrl)
+    except Exception as retry_err:
+        print("ARTICLE RETRYING FAILURE:", str(retry_err))
+        totally_fail = {
+            "title": articleTitle,
+            "URL": articleUrl,
+        }
+        totallyFailedData.append(totally_fail)
 
 # Save remaining data
 if len(articleData) > 0:
     ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
     print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
+    print("Total fetched papers:", len(articleData) + articleNum)
 
 if len(authorData) > 0:
     ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
     print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
+    print("Total fetched authors:", len(authorData) + authorNum)
+
+# Save error records
+if len(totallyFailedData) > 0:
+    ejde_save.save_data(totallyFailedData, "", "Failed_article_record.json")
+    print("Total papers that failed processing:", len(totallyFailedData))
+
+if len(failedVolData) > 0:
+    ejde_save.save_data(failedVolData, "", "Failed_volume_record.json")
+    print("Total volumes that failed fetching:", len(failedVolData))
+
+if len(failedFormatData) > 0:
+    ejde_save.save_data(failedFormatData, "", "Failed_format_record.json")
+    print("Total articles with unexpected HTML format:", len(failedFormatData))
 
 # Transfer to large file and delete the temporary storage files
 ejde_save.Transf()
-ejde_save.delete()
+# ejde_save.delete()
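
Note on the new retry decorator: @retry(wait_fixed=5000, stop_max_attempt_number=5) from the retrying package re-runs the wrapped function whenever it raises, waits 5000 ms between attempts, and re-raises the last exception after the fifth failure, which is what lets process_html_article catch that final error and record the paper in failedData. A minimal sketch of the behavior, with a made-up flaky_fetch function and URL used purely for illustration:

    import requests
    from retrying import retry

    @retry(wait_fixed=5000, stop_max_attempt_number=5)
    def flaky_fetch(url):
        # Any exception (timeout, non-2xx status, etc.) triggers another attempt;
        # after five attempts the last exception propagates to the caller.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text

    try:
        html = flaky_fetch("https://example.org/")
    except Exception as err:
        print("Giving up after retries:", err)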
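Note on the periodic flush in process_html_article: several worker threads append to the shared articleData and authorData lists, so the len(...) % batch_size == 0 check can be skipped when two appends land between checks, and clear() can race with another thread's save. A guarded variant is sketched below; the threading.Lock and the flush_articles helper are suggestions, not part of this patch:

    import threading

    article_lock = threading.Lock()

    def flush_articles():
        global articleNum
        with article_lock:
            # Flush whenever the buffer has reached the batch size, instead of
            # relying on hitting an exact multiple of batch_size.
            if len(articleData) >= batch_size:
                ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
                articleNum += len(articleData)
                articleData.clear()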
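For reference, datetime_transform now normalizes the journal's hand-typed dates by first trying "%B %d, %Y" directly and, only on failure, substituting the known month typos and parsing again; the two cases hard-coded in the old version are covered by the table. Expected results, assuming the month_typo mapping above:

    print(datetime_transform("Janaury 15, 2021"))   # -> 2021-01-15
    print(datetime_transform("Match 7, 2012"))      # -> 2012-03-07
    print(datetime_transform("August 4, 2023"))     # -> 2023-08-04 (no typo path taken)
    print(datetime_transform("Notamonth 1, 1999"))  # prints TYPO and returns the raw string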