From d8addf520416fd65427e34730098fc76cd2e04ab Mon Sep 17 00:00:00 2001
From: XCX <1361986662@qq.com>
Date: Fri, 14 Jul 2023 18:50:36 +0800
Subject: [PATCH] Update the EJDE and SpringerOpen spiders

---
 EJDE_spider/Transf.py             |  38 ++++++
 EJDE_spider/ejde_scrawler.py      | 185 ++++++++++++++++++++++++++++++
 SpringerOpen_spider/SD_detail.py  | 148 ++++++++++++++++++++++++
 SpringerOpen_spider/SD_header.py  |  25 ++++
 SpringerOpen_spider/SD_link.py    |  15 +++
 SpringerOpen_spider/SD_main.py    |  75 ++++++++++++
 SpringerOpen_spider/SD_save.py    |  85 ++++++++++++++
 SpringerOpen_spider/SD_scrawl.py  |  48 ++++++++
 SpringerOpen_spider/SD_threads.py |  25 ++++
 9 files changed, 644 insertions(+)
 create mode 100644 EJDE_spider/Transf.py
 create mode 100644 EJDE_spider/ejde_scrawler.py
 create mode 100644 SpringerOpen_spider/SD_detail.py
 create mode 100644 SpringerOpen_spider/SD_header.py
 create mode 100644 SpringerOpen_spider/SD_link.py
 create mode 100644 SpringerOpen_spider/SD_main.py
 create mode 100644 SpringerOpen_spider/SD_save.py
 create mode 100644 SpringerOpen_spider/SD_scrawl.py
 create mode 100644 SpringerOpen_spider/SD_threads.py

diff --git a/EJDE_spider/Transf.py b/EJDE_spider/Transf.py
new file mode 100644
index 0000000..dce6a10
--- /dev/null
+++ b/EJDE_spider/Transf.py
@@ -0,0 +1,38 @@
+import os
+import json
+
+
+# Read and merge the JSON data from every file in a folder
+def Read(folder_path):
+    data = []
+    for filename in os.listdir(folder_path):
+        if filename.endswith('.json'):
+            file_path = os.path.join(folder_path, filename)
+            with open(file_path, 'r', encoding='utf-8') as file:
+                data.extend(json.load(file))
+    return data
+
+
+# Write the merged data into an output file
+def Write(data, output_file):
+    with open(output_file, 'w', encoding='utf-8') as file:
+        json.dump(data, file, indent=4)
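+
+
+# Optional, illustrative variant of Read (not called anywhere): it skips
+# buffer files that are missing or malformed, so one bad file cannot abort
+# the whole merge. It assumes the same folder layout used below.
+def Read_safe(folder_path):
+    data = []
+    for filename in os.listdir(folder_path):
+        if filename.endswith('.json'):
+            file_path = os.path.join(folder_path, filename)
+            try:
+                with open(file_path, 'r', encoding='utf-8') as file:
+                    content = json.load(file)
+            except (OSError, json.JSONDecodeError):
+                print("Skipping unreadable buffer file:", file_path)
+                continue
+            if isinstance(content, list):
+                data.extend(content)
+    return data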
+
+
+# Paths of the files to be read (forward slashes avoid the invalid "\e"
+# escape in the original backslash paths)
+folder_path1 = './ejde_buffer/Author'
+folder_path2 = './ejde_buffer/Article'
+
+# Read the data in the files
+Author_data = Read(folder_path1)
+Article_data = Read(folder_path2)
+
+# Paths of the output files
+output_file1 = './ejde_buffer/Author_output_file.json'
+output_file2 = './ejde_buffer/Article_output_file.json'
+
+# Write into files
+Write(Author_data, output_file1)
+Write(Article_data, output_file2)
+
+# End
+print("\nData has been written into files.")
\ No newline at end of file
diff --git a/EJDE_spider/ejde_scrawler.py b/EJDE_spider/ejde_scrawler.py
new file mode 100644
index 0000000..49b95e0
--- /dev/null
+++ b/EJDE_spider/ejde_scrawler.py
@@ -0,0 +1,185 @@
+import os
+import threading
+import uuid
+import requests
+from bs4 import BeautifulSoup
+import re
+import json
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from retrying import retry
+
+
+def save_data(dataset, filetype, filename):
+    if dataset:
+        directory = "./ejde_buffer/" + filetype + "/"
+        os.makedirs(directory, exist_ok=True)
+        filepath = os.path.join(directory, filename)
+        with open(filepath, "w", encoding='utf-8') as json_file:
+            json.dump(dataset, json_file, indent=4)
+        print(filetype + " data have been added to", filepath)
+
+
+# Retry each volume page up to 5 times, waiting 5 seconds between attempts
+@retry(wait_fixed=5000, stop_max_attempt_number=5)
+def process_article(url):
+    response = requests.get(url)
+    response.raise_for_status()
+
+    baseWeb = url[:url.rfind('/')] + "/"
+    html = response.text
+    soup = BeautifulSoup(html, "html.parser")
+
+    articles = soup.find_all("li")
+
+    for article in articles:
+        # Skip <li> entries that are not article listings
+        if article.find("strong") is None or article.find("em") is None or article.find("a") is None:
+            continue
+
+        authors = article.find("strong").text.strip().split(", ")
+        title = article.find("em").text.strip()
+        article_url = baseWeb + article.find("a")["href"]
+
+        # Access the article detail page
+        response = requests.get(article_url)
+        html = response.text
+        soup = BeautifulSoup(html, 'html.parser')
+
+        article_text = soup.get_text()
+
+        # Extract volume
+        volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
+        volume = volume_match.group(1) if volume_match else None
+        # year = volume_match.group(2) if volume_match else None
+
+        # Extract page range
+        pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
+        pp = pp_match.group(1) if pp_match else None
+
+        # Extract issue
+        issue_match = re.search(r'No\. (\d+)', article_text)
+        issue = issue_match.group(1) if issue_match else None
+
+        # Extract submission date
+        match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
+        submitted_date = match.group(1) if match else None
+
+        # Extract publication date
+        match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
+        publish_date = match.group(1) if match else None
+
+        # Extract MSC codes
+        msc_match = re.search(r'Math Subject Classifications: (.*?)\n', html)
+        if not msc_match:
+            msc_match = re.search(r'Math Subject Classifications:(.*?)\.', html)
+        if msc_match:
+            msc = msc_match.group(1).strip().strip('.')
+            msc = re.split(r', |;', msc)
+        else:
+            msc = None
+
+        # Extract keywords
+        keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
+        if not keywords_match:
+            # Fallback: match to the end of the line (the old non-greedy
+            # pattern with nothing after it always matched an empty string)
+            keywords_match = re.search(r'Key Words: (.*)', html)
+        if keywords_match:
+            keywords = keywords_match.group(1).strip().replace('\n', '')
+            keywords = re.split(r', |;', keywords)
+            keywords = [keyword.strip().strip('.') for keyword in keywords]
+        else:
+            keywords = None
+
+        # Extract DOI
+        doi_match = re.search(r'DOI: (.+)(?=<)', html)
+        if not doi_match:
+            doi_match = re.search(r'DOI: (.+)', html)
+        doi = doi_match.group(1) if doi_match else None
+
+        # Article id
+        article_id = str(uuid.uuid4())
+
+        article_data = {
+            "article_id": article_id,
+            "title": title,
+            "authors": authors,
+            "corresponding_authors": None,
+            "submit_datetime": submitted_date,
+            "publish_datetime": publish_date,
+            "keywords": keywords,
+            "MSC": msc,
+            "URL": article_url,
+            "DOI": doi,
+            "publisher": "Texas State University",
+            "journal": "Electronic Journal of Differential Equations",
+            "volume": volume,
+            "issue": issue,
+            "page": pp,
+        }
+        articleData.append(article_data)
+
+        # Author info
+        table = soup.find('table')
+        if table is None:
+            continue
+        for row in table.find_all('tr'):
+            cells = [cell.text.strip() for cell in row.find_all('td')]
+            for cell in cells:
+                cell = cell.split("\n")
+                cell = [element.replace('email: ', '') for element in cell]
+                cell = [c.strip() for c in cell]
+
+                # Split the cell into name, affiliation, and email
+                name = cell[0].split(" ")
+                affiliation = ', '.join(cell[1:-1])
+                email = cell[-1]
+
+                author_data = {
+                    "author_id": str(uuid.uuid4()),
+                    "from_article": article_id,
+                    "first_name": name[0],
+                    "last_name": name[-1],
+                    "middle_name": name[1:len(name) - 1] if len(name) > 2 else None,
+                    "affiliation": [{
+                        "year": volume,
+                        "affiliation": affiliation,
+                        "email": email,
+                    }]
+                }
+                authorData.append(author_data)
+
+        # Save the data periodically based on batch size; the lock keeps the
+        # check-save-clear sequence atomic across the worker threads
+        with data_lock:
+            if articleData and len(articleData) % batch_size == 0:
+                save_data(articleData, "Article", str(uuid.uuid4()) + ".json")
+                articleData.clear()
+
+            if authorData and len(authorData) % batch_size == 0:
+                save_data(authorData, "Author", str(uuid.uuid4()) + ".json")
+                authorData.clear()
+
+
+index = "https://ejde.math.txstate.edu/indexleft.html"
+response = requests.get(index)
+soup = BeautifulSoup(response.content, 'html.parser')
+
+# Find all the URL links under the first (Volumes) section
+volume_links = soup.select('font > a[href]')
+
+# Extract and store the URLs in a list, oldest volume first
+url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1]
+
+authorData = []
+articleData = []
+data_lock = threading.Lock()  # Guards the shared lists across threads
+
+batch_size = 500  # Number of articles to process before saving
+executor = ThreadPoolExecutor(max_workers=20)  # Set the number of worker threads
+
+# Process each URL using multithreading
+futures = [executor.submit(process_article, url) for url in url_list]
+
+# Wait for all tasks to complete
+for future in as_completed(futures):
+    try:
+        future.result()
+    except Exception as e:
+        print("An error occurred:", str(e))
+
+# Save remaining data
+if articleData:
+    save_data(articleData, "Article", str(uuid.uuid4()) + ".json")
+    print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article/")
+
+if authorData:
+    save_data(authorData, "Author", str(uuid.uuid4()) + ".json")
+    print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author/")
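+
+# Added tidy-up (a safe no-op at this point): release the worker threads;
+# the as_completed() loop above already guarantees every future is done.
+executor.shutdown(wait=True)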
diff --git a/SpringerOpen_spider/SD_detail.py b/SpringerOpen_spider/SD_detail.py
new file mode 100644
index 0000000..8dd02dc
--- /dev/null
+++ b/SpringerOpen_spider/SD_detail.py
@@ -0,0 +1,148 @@
+import uuid
+
+
+# ========== Extract the detail-page data ==========
+def Author_dict(soup, article_id, Author_list):
+    info = soup.find('article', lang='en')
+    author_info = info.find('div', id='author-information-content')
+    article_info = info.find('div', class_='c-article-header')
+
+    # Authors
+    authors = article_info.find('ul', class_='c-article-author-list')
+    authors = authors.find_all('li', class_='c-article-author-list__item')
+    for author in authors:
+        # Name
+        author = author.find('a').get_text()
+        author = author.split(' ')
+        author = [char.replace('-', '') for char in author]
+
+        Firstname = author[0]
+        Lastname = author[-1]
+        Middlename = ''.join(author[1:-1]) if author[1:-1] else None
+
+        # Year
+        Year = info.find('span', attrs={'data-test': 'article-publication-year'}).get_text()
+
+        # Affiliation (only the first listed address is taken)
+        Affiliation = author_info.find('p', class_='c-article-author-affiliation__address').get_text()
+
+        # Email
+        Email = None  # The page does not expose author email addresses
+
+        # Put the fields into a dict
+        author_data = {
+            "author_id": str(uuid.uuid4()),
+            "from_article": article_id,
+            "firstname": Firstname,
+            "lastname": Lastname,
+            "middlename": Middlename,
+            "affiliation": [
+                {
+                    "year": Year,
+                    "affiliation": Affiliation,
+                    "email": Email
+                }
+            ]
+        }
+
+        Author_list.append(author_data)
+
+    return Author_list
+
+
+def Article_dict(soup, url, article_id):
+    info = soup.find('article', lang='en')
+    article_info = info.find('div', class_='c-article-header')
+
+    # Title
+    Title = article_info.find('h1').get_text()
+
+    # Authors
+    Author = []  # A new empty list
+
+    author_list = article_info.find('ul', class_='c-article-author-list')
+    authors = author_list.find_all('li', class_='c-article-author-list__item')
+    for author in authors:
+        author = author.find('a').get_text()
+        author = [char.replace('-', '') for char in author]
+        author = ''.join(author)
+        Author.append(author)
+
+    # Corresponding authors (the list may be absent, so check it before
+    # calling find_all on it)
+    Corresponding_author = []  # A new empty list
+
+    corresponding_author_list = info.find('p', id='corresponding-author-list')
+    if corresponding_author_list is not None:
+        corresponding_authors = corresponding_author_list.find_all('a')
+        for corresponding_author in corresponding_authors:
+            corresponding_author = corresponding_author.get_text()
+            corresponding_author = [char.replace('-', '') for char in corresponding_author]
+            corresponding_author = ''.join(corresponding_author)
+            Corresponding_author.append(corresponding_author)
+
+    # Submitted datetime & published datetime
+    Time = []
+
+    time_list = info.find('ul', class_='c-bibliographic-information__list')
+    times = time_list.find_all('time')
+    for time in times:
+        time = time.get_text()
+        Time.append(time)
+
+    Submitted_date = Time[0]
+    Publish_date = Time[-1]
+
+    # Keywords
+    Keyword = []  # A new empty list
+
+    keyword_list = info.find('ul', class_='c-article-subject-list')
+    if keyword_list is not None:
+        keywords = keyword_list.find_all('li')
+        for keyword in keywords:
+            keyword = keyword.get_text()
+            Keyword.append(keyword)
+
+    # MSC
+    MSC = None  # springeropen.com does not publish MSC codes
+
+    # DOI
+    DOI = info.find('li', class_='c-bibliographic-information__list-item c-bibliographic-information__list-item--doi')
+    if DOI is not None:
+        DOI = DOI.find('span', class_='c-bibliographic-information__value').get_text()
+
+    # Publisher
+    Publisher = 'springeropen.com'
+
+    # Journal
+    Journal = info.find('p', class_='c-article-info-details')
+    Journal = Journal.find('i').get_text()
+
+    # Volume (the publication year is stored here; SD_save filters on it)
+    Volume = info.find('span', attrs={'data-test': 'article-publication-year'}).get_text()
+
+    # Issue (the article number stands in for the issue)
+    Issue = info.find('p', class_='c-article-info-details')
+    Issue = Issue.find('span', attrs={'data-test': 'article-number'}).get_text()
+
+    # Page
+    Page = None  # Articles carry an article number rather than page numbers
+
+    # Put the fields into a dict
+    article_data = {
+        "article_id": article_id,
+        "title": Title,
+        "authors": Author,
+        "corresponding_authors": Corresponding_author,
+        "submit_datetime": Submitted_date,
+        "publish_datetime": Publish_date,
+        "keywords": Keyword,
+        "MSC": MSC,
+        "URL": url,
+        "DOI": DOI,
+        "publisher": Publisher,
+        "journal": Journal,
+        "volume": Volume,
+        "issue": Issue,
+        "page": Page,
+    }
+
+    return article_data
\ No newline at end of file
diff --git a/SpringerOpen_spider/SD_header.py b/SpringerOpen_spider/SD_header.py
new file mode 100644
index 0000000..162e82d
--- /dev/null
+++ b/SpringerOpen_spider/SD_header.py
@@ -0,0 +1,25 @@
+import random
+
+# Pool of User-Agent strings to rotate through
+uapools = [
+    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3493.3 Safari/537.36",
+    "Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
+    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
+    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
+    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
+]
+
+
+def header():
+    # Request headers with a randomly chosen User-Agent
+    headers = {
+        'User-Agent': random.choice(uapools),
+    }
+
+    return headers
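+
+# Illustrative usage (assumes the requests package, as used in SD_link.py):
+#     import requests, SD_header
+#     response = requests.get(url, headers=SD_header.header())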
diff --git a/SpringerOpen_spider/SD_link.py b/SpringerOpen_spider/SD_link.py
new file mode 100644
index 0000000..a46d542
--- /dev/null
+++ b/SpringerOpen_spider/SD_link.py
@@ -0,0 +1,15 @@
+import requests
+from bs4 import BeautifulSoup
+
+
+# Standard request wrapper: fetch a page and return its parsed soup
+def Link(url, headers):
+    try:
+        response = requests.get(url, headers=headers)
+        response.raise_for_status()  # Raise if the request failed
+        html = response.text
+        soup = BeautifulSoup(html, 'html.parser')
+        return soup
+
+    except requests.exceptions.RequestException as e:
+        print("Request failed:", e)
+        return None
diff --git a/SpringerOpen_spider/SD_main.py b/SpringerOpen_spider/SD_main.py
new file mode 100644
index 0000000..e94fd72
--- /dev/null
+++ b/SpringerOpen_spider/SD_main.py
@@ -0,0 +1,75 @@
+import SD_header
+import SD_link
+import SD_threads
+import SD_save
+from urllib.parse import urljoin
+
+'''
+    Target site: https://www.springeropen.com
+
+    ========== Execution order ==========
+    1. SD_main    Collect the links of all mathematics journals on SpringerOpen
+                  -> collect the links of each journal's article-list pages
+    2. SD_threads Manage the thread pool -> call SD_scrawl
+    3. SD_scrawl  Collect the article detail-page links -> call SD_detail
+    4. SD_detail  Parse each detail page -> call SD_save -> buffer the records
+                  in small JSON files
+    5. SD_main    Call SD_save -> filter the buffered files into large
+                  per-period output files
+    *6. SD_save   (optional) Delete every buffered file (back them up first)
+'''
+
+
+# Empty lists for the page links
+Links = []  # A list for article-list page links
+Webs = []   # A list for journal URLs and page counts
+
+# Empty lists for the crawled data
+Article_data = []
+Author_data = []
+
+# ========== Visit the article-list pages ==========
+# Collect the mathematics journal links
+headers = SD_header.header()
+soup = SD_link.Link('https://www.springeropen.com/journals', headers)
+
+hrefs = soup.find('ol', id='Mathematics-list')
+hrefs = hrefs.find_all('a')
+for href in hrefs:
+    href = 'http:' + href.get('href') + '/articles'
+    sub_soup = SD_link.Link(href, headers)
+
+    # Number of article-list pages in the current journal
+    pp = sub_soup.find('p', class_='u-text-sm u-reset-margin').get_text()
+    pp = pp.split(' ')[-1]
+
+    # Build the paged list URL
+    url = urljoin(href, 'articles?searchType=journalSearch&sort=PubDate&page=')
+
+    # Store the URL and page count
+    web = {
+        "url": url,
+        "page": int(pp)
+    }
+    Webs.append(web)
+
+# Expand each journal into one link per list page
+for web in Webs:
+    for page in range(1, web['page'] + 1):
+        link = web['url'] + str(page)
+        Links.append(link)
+
+print('\nThe links have been stored!\n')
+
+# Start crawling in the thread pool
+SD_threads.Threads(Links, Article_data, Author_data)
+
+# Merge the buffered JSON files
+SD_save.Transf()
+
+# # ========== Delete all buffered files (optional; back them up first) ==========
+# SD_save.delete('./SpringerOpen_buffer/Article_TS/')
+# SD_save.delete('./SpringerOpen_buffer/Author_TS/')
diff --git a/SpringerOpen_spider/SD_save.py b/SpringerOpen_spider/SD_save.py
new file mode 100644
index 0000000..b088f5c
--- /dev/null
+++ b/SpringerOpen_spider/SD_save.py
@@ -0,0 +1,85 @@
+import os
+import json
+
+
+# Buffer the data into a small JSON file (about 50 articles each)
+def save_data(dataset, filetype, filename):
+    if dataset:
+        directory = "./SpringerOpen_buffer/" + filetype + "/"
+        os.makedirs(directory, exist_ok=True)
+        filepath = os.path.join(directory, filename)
+        with open(filepath, "w", encoding='utf-8') as json_file:
+            json.dump(dataset, json_file, indent=4)
+        print(filetype + " data have been added to", filepath)
+
+
+# Filter the buffered files and merge them into the final output files
+def Transf():
+    def Read(folder_path, output_files):
+        # Create the output folders
+        os.makedirs('./SpringerOpen_buffer/Article_output/', exist_ok=True)
+        os.makedirs('./SpringerOpen_buffer/Author_output/', exist_ok=True)
+
+        # Merge every buffered file first, so a later file does not
+        # overwrite the output already written for an earlier one
+        data = []
+        for filename in os.listdir(folder_path):
+            if filename.endswith('.json'):
+                file_path = os.path.join(folder_path, filename)
+                with open(file_path, 'r', encoding='utf-8') as file:
+                    data.extend(json.load(file))
+
+        # Split the records by year (articles carry the year in 'volume',
+        # authors carry it inside 'affiliation')
+        data_oldest = [Dict for Dict in data if (isinstance(Dict, dict) and int(
+            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
+
+        data_2010_2014 = [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
+            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
+
+        data_2015_2020 = [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
+            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
+
+        data_newest = [Dict for Dict in data if (isinstance(Dict, dict) and int(
+            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
+
+        Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
+
+        # Write each period to its output file
+        for index in range(0, 4):
+            with open(output_files[index], 'w', encoding='utf-8') as file:
+                json.dump(Data[index], file, indent=4)
+
+    # Input folders (the "_TS" buffer folders written by SD_scrawl)
+    author_folder_path = './SpringerOpen_buffer/Author_TS'
+    article_folder_path = './SpringerOpen_buffer/Article_TS'
+
+    # Output files
+    author_output_file = [
+        './SpringerOpen_buffer/Author_output/Author_output_file(oldest).json',
+        './SpringerOpen_buffer/Author_output/Author_output_file(2010-2014).json',
+        './SpringerOpen_buffer/Author_output/Author_output_file(2015-2020).json',
+        './SpringerOpen_buffer/Author_output/Author_output_file(newest).json'
+    ]
+
+    article_output_file = [
+        './SpringerOpen_buffer/Article_output/Article_output_file(oldest).json',
+        './SpringerOpen_buffer/Article_output/Article_output_file(2010-2014).json',
+        './SpringerOpen_buffer/Article_output/Article_output_file(2015-2020).json',
+        './SpringerOpen_buffer/Article_output/Article_output_file(newest).json'
+    ]
+
+    # Read the buffers and write the merged files
+    Read(author_folder_path, author_output_file)
+    Read(article_folder_path, article_output_file)
+
+    # End
+    print("\nData has been written into files.")
+
+
+# Delete every file in a buffer folder
+def delete(folder_path):
+    file_names = os.listdir(folder_path)
+
+    for file_name in file_names:
+        file_path = os.path.join(folder_path, file_name)
+        if os.path.isfile(file_path):
+            os.remove(file_path)
+
+    print('\nAttention: The temporary storage files have been deleted!')
diff --git a/SpringerOpen_spider/SD_scrawl.py b/SpringerOpen_spider/SD_scrawl.py
new file mode 100644
index 0000000..c79f41a
--- /dev/null
+++ b/SpringerOpen_spider/SD_scrawl.py
@@ -0,0 +1,48 @@
+import time
+import uuid
+from urllib.parse import urljoin
+
+import SD_header
+import SD_link
+import SD_detail
+import SD_save
+
+
+# ========== Collect the article detail-page links ==========
+def Scrawl(Link, Article_data, Author_data):
+    # Visit the article-list page
+    headers = SD_header.header()
+    soup = SD_link.Link(Link, headers)
+    print(Link)
+
+    # Collect every article detail-page link
+    Essay_Ol = soup.find('ol')          # The article list
+    Essay_Li = Essay_Ol.find_all('li')  # All list items
+
+    # Crawl every article on this list page (about 50)
+    for Essay_hrefs in Essay_Li:
+        Essay_href = Essay_hrefs.find('a', itemprop='url')
+        if Essay_href is not None:
+            time.sleep(0.1)
+
+            # Resolve the relative link against the current journal's own
+            # domain rather than one hard-coded journal
+            sub_Link = Essay_href.get('href')
+            sub_Link = urljoin(Link, sub_Link)
+
+            # ========== Visit the article detail page ==========
+            sub_soup = SD_link.Link(sub_Link, headers)  # Fetch the detail page
+            article_id = str(uuid.uuid4())              # Article id
+
+            # Parse the details and append them to the matching lists
+            Article_data.append(SD_detail.Article_dict(sub_soup, sub_Link, article_id))
+            Author_data = SD_detail.Author_dict(sub_soup, article_id, Author_data)
+
+    # Buffer into small JSON files, then clear the lists so the same
+    # records are not written again for the next page
+    if Article_data:
+        index = str(uuid.uuid4())
+        SD_save.save_data(Article_data, "Article_TS", index + ".json")
+        Article_data.clear()
+        print('Finished: ' + index + ' has been added to ./SpringerOpen_buffer/Article_TS/')
+
+    if Author_data:
+        index = str(uuid.uuid4())
+        SD_save.save_data(Author_data, "Author_TS", index + ".json")
+        Author_data.clear()
+        print('Finished: ' + index + ' has been added to ./SpringerOpen_buffer/Author_TS/')
\ No newline at end of file
diff --git a/SpringerOpen_spider/SD_threads.py b/SpringerOpen_spider/SD_threads.py
new file mode 100644
index 0000000..d5d7527
--- /dev/null
+++ b/SpringerOpen_spider/SD_threads.py
@@ -0,0 +1,25 @@
+from concurrent.futures import ThreadPoolExecutor, as_completed, wait
+import SD_scrawl
+
+
+# ========== Multithreaded processing ==========
+def Threads(Links, Article_data, Author_data):
+    executor = ThreadPoolExecutor(max_workers=20)  # Thread pool
+
+    # Submit one crawl task per article-list link
+    futures = [executor.submit(SD_scrawl.Scrawl, Link, Article_data, Author_data) for Link in Links]
+
+    # max_iterations = 5  # Maximum number of pages to crawl
+    # iteration_count = 0  # Counter
+
+    # Wait for every task to complete
+    for future in as_completed(futures):
+        try:
+            future.result()
+            # # Cap the number of pages crawled
+            # iteration_count += 1  # Increment the counter
+            # if iteration_count >= max_iterations:
+            #     break
+        except Exception as e:
+            print("An error occurred:", str(e))
+
+    wait(futures)
\ No newline at end of file
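
--
A minimal driver sketch (illustrative only; the names come from SD_main.py
and SD_threads.py above, and SD_main.py already performs these steps):

    import SD_threads
    # Links: one URL per article-list page, assembled in SD_main.py
    SD_threads.Threads(Links, Article_data, Author_data)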