import uuid
import requests
import re
import ejde_save
from datetime import datetime
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed

'''
Target site: 'ejde.math.txstate.edu'

========== Run order ==========
1. ejde_main
   Collect the per-year volume links -> scrape each article's metadata and author
   information -> call ejde_save -> buffer the records in small JSON files.
2. ejde_save
   Read the buffered small files locally, filter them, and merge them into one
   large file per year.
*3. ejde_save.delete() (optional)
   Delete every file in the buffer directory (make a backup first).
'''


def datetime_transform(date):
    # Convert a date such as "January 5, 2021" into "2021-01-05".
    input_date = datetime.strptime(date, "%B %d, %Y")
    return input_date.strftime("%Y-%m-%d")


# Article and author detail
def process_article(url):
    response = requests.get(url)
    response.raise_for_status()
    baseWeb = url[:url.rfind('/')] + "/"
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    articles = soup.find_all("li")

    for article in articles:
        authors = article.find("strong").text.strip().split(", ")
        title = article.find("em").text.strip()
        article_url = baseWeb + article.find("a")["href"]

        # Access article detail page
        response = requests.get(article_url)
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')
        article_text = soup.get_text()

        # Extract volume
        volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
        volume = volume_match.group(1) if volume_match else None

        # Extract pp
        pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
        pp = pp_match.group(1) if pp_match else None

        # Extract issue
        issue_match = re.search(r'No\. (\d+)', article_text)
        issue = issue_match.group(1) if issue_match else None

        # Extract submission date
        match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
        submitted_date = datetime_transform(match.group(1)) if match else None

        # Extract publication date
        match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
        publish_date = datetime_transform(match.group(1)) if match else None

        # Extract MSC
        msc_match = re.search(r'Math Subject Classifications: (.*?)\n', html)
        if not msc_match:
            msc_match = re.search(r'Math Subject Classifications:(.*?)\.', html)
        if msc_match:
            msc = msc_match.group(1).strip().strip('.')
            msc = re.split(r', |;', msc)
        else:
            msc = None

        # Extract KeyWords
        keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
        if not keywords_match:
            keywords_match = re.search(r'Key Words: (.*?)\n', html, re.DOTALL)
        if keywords_match:
            keywords = keywords_match.group(1).strip().replace('\n', '')
            keywords = re.split(r', |;', keywords)
            keywords = [keyword.strip().strip('.') for keyword in keywords]
        else:
            keywords = None

        # Extract DOI
        doi_match = re.search(r'DOI: (.+)(?=<)', html)
        if not doi_match:
            doi_match = re.search(r'DOI: (.+)', html)
        doi = doi_match.group(1) if doi_match else None

        # Article_id
        article_id = str(uuid.uuid4())

        article_data = {
            "article_id": article_id,
            "title": title,
            "authors": authors,
            "corresponding_authors": None,
            "submit_datetime": submitted_date,
            "publish_datetime": publish_date,
            "keywords": keywords,
            "MSC": msc,
            "URL": article_url,
            "DOI": doi,
            "publisher": "Texas State University",
            "journal": "Electronic Journal of Differential Equations",
            "volume": volume,
            "issue": issue,
            "page": pp,
        }
        articleData.append(article_data)

        # Author info: each table cell holds one author block
        # ("name", affiliation lines, "email: ...")
        table = soup.find('table')
        for row in table.find_all('tr'):
            cells = [cell.text.strip() for cell in row.find_all('td')]
            for cell in cells:
                cell = cell.split("\n")
                cell = [element.replace('email: ', '') for element in cell]
                cell = [c.strip() for c in cell]

                # Data processing
                name = cell[0].split(" ")
                affiliation = ', '.join(cell[1:-1])
                email = cell[-1]

                author_data = {
                    "author_id": str(uuid.uuid4()),
                    "from_article": article_id,
                    "first_name": name[0],
                    "last_name": name[-1],
                    "middle_name": name[1:len(name) - 1] if len(name) > 2 else None,
                    "affiliation": [{
                        "year": volume,
                        "affiliation": affiliation,
                        "email": email,
                    }],
                }
                authorData.append(author_data)

        # Save the data periodically based on batch size.
        # NOTE: articleData/authorData are shared by all worker threads without a
        # lock, so these flush checks are best-effort only.
        if len(articleData) % batch_size == 0:
            ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
            articleData.clear()
        if len(authorData) % batch_size == 0:
            ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
            authorData.clear()


index = "https://ejde.math.txstate.edu/indexleft.html"
response = requests.get(index)
soup = BeautifulSoup(response.content, 'html.parser')

# Find all the URL links under the first (Volumes) section
volume_links = soup.select('font > a[href]')

# Extract and store the URLs in a list using list comprehension
# (skip the first link and reverse the order)
url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1]

authorData = []
articleData = []
batch_size = 100  # Number of articles to process before saving

executor = ThreadPoolExecutor(max_workers=25)  # Set the number of worker threads

# Process each URL using multithreading
futures = [executor.submit(process_article, url) for url in url_list]

# Wait for all tasks to complete
for future in as_completed(futures):
    try:
        future.result()
    except Exception as e:
        print("An error occurred:", str(e))

# Save remaining data
if len(articleData) > 0:
    ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
if len(authorData) > 0:
    ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")

# Transfer to large file and delete the temporary storage files
ejde_save.Transf()
ejde_save.delete()
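
# ---------------------------------------------------------------------------
# For reference: a minimal sketch of the ejde_save interface this script
# assumes, based only on how save_data/Transf/delete are called above and on
# the ./ejde_buffer/ paths in the COMPLETE messages. This is an illustration,
# not the actual ejde_save implementation; names and layout may differ.
#
#   import json
#   import os
#   import shutil
#
#   BUFFER_DIR = "ejde_buffer"
#
#   def save_data(data, sub_dir, file_name):
#       """Buffer one batch of records as ./ejde_buffer/<sub_dir>/<file_name>."""
#       path = os.path.join(BUFFER_DIR, sub_dir)
#       os.makedirs(path, exist_ok=True)
#       with open(os.path.join(path, file_name), "w", encoding="utf-8") as f:
#           json.dump(data, f, ensure_ascii=False, indent=2)
#
#   def delete():
#       """Remove the whole buffer directory (make a backup first)."""
#       shutil.rmtree(BUFFER_DIR, ignore_errors=True)
#
# Transf() (merging the buffered small files into per-year large files) is not
# sketched here because its filtering rules live entirely in ejde_save.
# ---------------------------------------------------------------------------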