Update the code from the past few weeks

XCX 2023-07-14 18:50:36 +08:00
parent 04806fa367
commit d8addf5204
9 changed files with 644 additions and 0 deletions

EJDE_spider/Transf.py

@@ -0,0 +1,38 @@
import os
import json

# Read the data from every JSON file in the given folder
def Read(folder_path):
    data = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                data.extend(json.load(file))
    return data

# Write the merged data into an output file
def Write(data, output_file):
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)

# Paths of the folders to read from
folder_path1 = './ejde_buffer/Author'
folder_path2 = './ejde_buffer/Article'
# Read the data from the buffered files
Author_data = Read(folder_path1)
Article_data = Read(folder_path2)
# Paths of the output files
output_file1 = './ejde_buffer/Author_output_file.json'
output_file2 = './ejde_buffer/Article_output_file.json'
# Write the merged data into the output files
Write(Author_data, output_file1)
Write(Article_data, output_file2)
# End
print("\nData has been written into files.")


@@ -0,0 +1,185 @@
import os
import uuid
import requests
from bs4 import BeautifulSoup
import re
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from retrying import retry

# Save a batch of records to a JSON file under ./ejde_buffer/<filetype>/
def save_data(dataset, filetype, filename):
    if dataset:
        directory = "./ejde_buffer/" + filetype + "/"
        os.makedirs(directory, exist_ok=True)
        filepath = os.path.join(directory, filename)
        with open(filepath, "w", encoding='utf-8') as json_file:
            json.dump(dataset, json_file, indent=4)
        print(filetype + " data have been added to", filepath)

# Crawl one volume index page and every article listed on it
@retry(wait_fixed=5000, stop_max_attempt_number=5)
def process_article(url):
    response = requests.get(url)
    response.raise_for_status()
    baseWeb = url[:url.rfind('/')] + "/"
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    articles = soup.find_all("li")
    for article in articles:
        # Skip list items that are not article entries
        if article.find("strong") is None or article.find("em") is None or article.find("a") is None:
            continue
        authors = article.find("strong").text.strip().split(", ")
        title = article.find("em").text.strip()
        article_url = baseWeb + article.find("a")["href"]
        # Access the article detail page
        response = requests.get(article_url)
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')
        article_text = soup.get_text()
        # Extract the volume
        volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
        volume = volume_match.group(1) if volume_match else None
        # year = volume_match.group(2) if volume_match else None
        # Extract the page range
        pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
        pp = pp_match.group(1) if pp_match else None
        # Extract the issue number
        issue_match = re.search(r'No\. (\d+)', article_text)
        issue = issue_match.group(1) if issue_match else None
        # Extract the submission date
        match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
        submitted_date = match.group(1) if match else None
        # Extract the publication date
        match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
        publish_date = match.group(1) if match else None
        # Extract the MSC codes
        msc_match = re.search(r'Math Subject Classifications: (.*?)\n', html)
        if not msc_match:
            msc_match = re.search(r'Math Subject Classifications:(.*?)\.', html)
        if msc_match:
            msc = msc_match.group(1).strip().strip('.')
            msc = re.split(r', |;', msc)
        else:
            msc = None
        # Extract the keywords
        keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
        if not keywords_match:
            keywords_match = re.search(r'Key Words: (.*?)<br>', html, re.DOTALL)
        if keywords_match:
            keywords = keywords_match.group(1).strip().replace('\n', '')
            keywords = re.split(r', |;', keywords)
            keywords = [keyword.strip().strip('.') for keyword in keywords]
        else:
            keywords = None
        # Extract the DOI
        doi_match = re.search(r'DOI: (.+)(?=<)', html)
        if not doi_match:
            doi_match = re.search(r'DOI: (.+)', html)
        doi = doi_match.group(1) if doi_match else None
        # Generate an article id
        article_id = str(uuid.uuid4())
        article_data = {
            "article_id": article_id,
            "title": title,
            "authors": authors,
            "corresponding_authors": None,
            "submit_datetime": submitted_date,
            "publish_datetime": publish_date,
            "keywords": keywords,
            "MSC": msc,
            "URL": article_url,
            "DOI": doi,
            "publisher": "Texas State University",
            "journal": "Electronic Journal of Differential Equations",
            "volume": volume,
            "issue": issue,
            "page": pp,
        }
        articleData.append(article_data)
        # Author info: parse the address table on the detail page
        table = soup.find('table')
        for row in table.find_all('tr'):
            cells = [cell.text.strip() for cell in row.find_all('td')]
            for cell in cells:
                cell = cell.split("\n")
                cell = [element.replace('email: ', '') for element in cell]
                cell = [c.strip() for c in cell]
                # Split each cell into name, affiliation and email
                name = cell[0].split(" ")
                affiliation = ', '.join(cell[1:-1])
                email = cell[-1]
                author_data = {
                    "author_id": str(uuid.uuid4()),
                    "from_article": article_id,
                    "first_name": name[0],
                    "last_name": name[-1],
                    "middle_name": name[1:len(name) - 1] if len(name) > 2 else None,
                    "affiliation": [{
                        "year": volume,
                        "affiliation": affiliation,
                        "email": email,
                    }]
                }
                authorData.append(author_data)
        # Save the data periodically based on the batch size
        if len(articleData) % batch_size == 0:
            save_data(articleData, "Article", str(uuid.uuid4()) + ".json")
            articleData.clear()
        if len(authorData) % batch_size == 0:
            save_data(authorData, "Author", str(uuid.uuid4()) + ".json")
            authorData.clear()

index = "https://ejde.math.txstate.edu/indexleft.html"
response = requests.get(index)
soup = BeautifulSoup(response.content, 'html.parser')
# Find all the URL links under the first (Volumes) section
volume_links = soup.select('font > a[href]')
# Extract and store the URLs in a list using a list comprehension
url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1]

authorData = []
articleData = []
batch_size = 500  # Number of articles to process before saving
executor = ThreadPoolExecutor(max_workers=20)  # Set the number of worker threads
# Process each volume index URL using multithreading
futures = [executor.submit(process_article, url) for url in url_list]
# Wait for all tasks to complete
for future in as_completed(futures):
    try:
        future.result()
    except Exception as e:
        print("An error occurred:", str(e))

# Save the remaining data
if articleData:
    save_data(articleData, "Article", str(uuid.uuid4()) + ".json")
print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article/")
if authorData:
    save_data(authorData, "Author", str(uuid.uuid4()) + ".json")
print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author/")

SD_detail.py

@@ -0,0 +1,148 @@
import uuid

# ========== Extract the details from an article page ==========
def Author_dict(soup, article_id, Author_list):
    info = soup.find('article', lang='en')
    author_info = info.find('div', id='author-information-content')
    article_info = info.find('div', class_='c-article-header')
    # Authors
    authors = article_info.find('ul', class_='c-article-author-list')
    authors = authors.find_all('li', class_='c-article-author-list__item')
    for author in authors:
        # Name
        author = author.find('a').get_text()
        author = author.split(' ')
        author = [char.replace('-', '') for char in author]
        Firstname = author[0]
        Lastname = author[-1]
        Middlename = ''.join(author[1:-1]) if author[1:-1] else None
        # Year
        Year = info.find('span', attrs={'data-test': 'article-publication-year'}).get_text()
        # Affiliation
        Affiliation = author_info.find('p', class_='c-article-author-affiliation__address').get_text()
        # Email
        Email = None  # The email is not reachable on the page
        # Put the fields into a dict
        author_data = {
            "author_id": str(uuid.uuid4()),
            "from_article": article_id,
            "firstname": Firstname,
            "lastname": Lastname,
            "middlename": Middlename,
            "affiliation": [
                {
                    "year": Year,
                    "affiliation": Affiliation,
                    "email": Email
                }
            ]
        }
        Author_list.append(author_data)
    return Author_list

def Article_dict(soup, url, article_id):
    info = soup.find('article', lang='en')
    article_info = info.find('div', class_='c-article-header')
    # Title
    Title = article_info.find('h1').get_text()
    # Authors
    Author = []  # A new empty list
    author_list = article_info.find('ul', class_='c-article-author-list')
    authors = author_list.find_all('li', class_='c-article-author-list__item')
    for author in authors:
        author = author.find('a').get_text()
        author = [char.replace('-', '') for char in author]
        author = ''.join(author)
        Author.append(author)
    # Corresponding authors
    Corresponding_author = []  # A new empty list
    corresponding_author_list = info.find('p', id='corresponding-author-list')
    if corresponding_author_list is not None:
        corresponding_authors = corresponding_author_list.find_all('a')
        for corresponding_author in corresponding_authors:
            corresponding_author = corresponding_author.get_text()
            corresponding_author = [char.replace('-', '') for char in corresponding_author]
            corresponding_author = ''.join(corresponding_author)
            Corresponding_author.append(corresponding_author)
    # Submitted datetime & published datetime
    Time = []
    time_list = info.find('ul', class_='c-bibliographic-information__list')
    times = time_list.find_all('time')
    for time in times:
        time = time.get_text()
        Time.append(time)
    Submitted_date = Time[0]
    Publish_date = Time[-1]
    # Keywords
    Keyword = []  # A new empty list
    keyword_list = info.find('ul', class_='c-article-subject-list')
    if keyword_list is not None:
        keywords = keyword_list.find_all('li')
        for keyword in keywords:
            keyword = keyword.get_text()
            Keyword.append(keyword)
    # MSC
    MSC = None  # SpringerOpen.com does not provide MSC codes
    # DOI
    DOI = info.find('li', class_='c-bibliographic-information__list-item c-bibliographic-information__list-item--doi')
    if DOI is not None:
        DOI = DOI.find('span', class_='c-bibliographic-information__value').get_text()
    # Publisher
    Publisher = 'springeropen.com'
    # Journal
    Journal = info.find('p', class_='c-article-info-details')
    Journal = Journal.find('i').get_text()
    # Volume (the publication year is used as the volume)
    Volume = info.find('span', attrs={'data-test': 'article-publication-year'}).get_text()
    # Issue (the article number is used as the issue)
    Issue = info.find('p', class_='c-article-info-details')
    Issue = Issue.find('span', attrs={'data-test': 'article-number'}).get_text()
    # Page
    Page = None
    # Put the fields into a dict
    article_data = {
        "article_id": article_id,
        "title": Title,
        "authors": Author,
        "corresponding_authors": Corresponding_author,
        "submit_datetime": Submitted_date,
        "publish_datetime": Publish_date,
        "keywords": Keyword,
        "MSC": MSC,
        "URL": url,
        "DOI": DOI,
        "publisher": Publisher,
        "journal": Journal,
        "volume": Volume,
        "issue": Issue,
        "page": Page,
    }
    return article_data

SD_header.py

@@ -0,0 +1,25 @@
import random

# Pool of User-Agent strings to rotate through
uapools = [
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3493.3 Safari/537.36",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
]

def header():
    # Build request headers with a randomly chosen User-Agent
    headers = {
        'User-Agent': random.choice(uapools),
    }
    return headers

SD_link.py

@@ -0,0 +1,15 @@
import requests
from bs4 import BeautifulSoup

# Standard request helper: fetch a URL and return the parsed BeautifulSoup object
def Link(url, headers):
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Check whether the request succeeded
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')
        return soup
    except requests.exceptions.RequestException as e:
        print("Request failed:", e)
        return None

SD_main.py

@@ -0,0 +1,75 @@
import urllib
import SD_header
import SD_link
import SD_threads
import SD_save
from urllib.parse import urljoin
'''
Crawl the site https://www.springeropen.com

========== Run order ==========
1. SD_main    Get the links of all mathematics journals on SpringerOpen -> get the links of each journal's article-list pages
2. SD_threads Manage the thread pool -> call SD_scrawl
3. SD_scrawl  Get the links of the article detail pages -> call SD_detail
4. SD_detail  Get and process the content of each detail page -> call SD_save -> store it temporarily in small JSON files
5. SD_main    Call SD_save -> read the temporary small files, filter them, and store them in large files split by year
6. SD_save    (Optional) Delete all files in the temporary storage area (back up first)
'''
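# Buffer layout produced by the pipeline (paths taken from SD_scrawl and SD_save):
#   ./SpringerOpen_buffer/Article_TS/<uuid>.json  - temporary article batches, one per list page
#   ./SpringerOpen_buffer/Author_TS/<uuid>.json   - temporary author batches
#   ./SpringerOpen_buffer/Article_output/Article_output_file(oldest|2010-2014|2015-2020|newest).json
#   ./SpringerOpen_buffer/Author_output/Author_output_file(oldest|2010-2014|2015-2020|newest).json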
# Empty lists for page links
Links = []  # A list for the article-list page links
Webs = []   # A list for the journal URLs and their page counts
# Empty lists for the crawled data
Article_data = []
Author_data = []

# ========== Visit the article-list pages ==========
# Get the links of the mathematics journals
headers = SD_header.header()
soup = SD_link.Link('https://www.springeropen.com/journals', headers)
hrefs = soup.find('ol', id='Mathematics-list')
hrefs = hrefs.find_all('a')
for href in hrefs:
    href = 'http:' + href.get('href') + '/articles'
    sub_soup = SD_link.Link(href, headers)
    # Get the number of article-list pages of the current journal
    pp = sub_soup.find('p', class_='u-text-sm u-reset-margin').get_text()
    pp = pp.split(' ')[-1]
    # Build the paginated URL template
    url = urllib.parse.urljoin(href, 'articles?searchType=journalSearch&sort=PubDate&page=')
    # Store it in a dict and append it to the list
    web = {
        "url": url,
        "page": int(pp)
    }
    Webs.append(web)

# Expand each journal into one link per list page
for web in Webs:
    for page in range(1, web['page'] + 1):
        link = web['url'] + str(page)
        Links.append(link)
print('\nThe links have been stored!\n')

# Start crawling with the thread pool
SD_threads.Threads(Links, Article_data, Author_data)
# Aggregate the temporary JSON files
SD_save.Transf()

# # ========== Delete all temporary small files (optional, back up first) ==========
# SD_save.delete('./SpringerOpen_buffer/Article_TS/')
# SD_save.delete('./SpringerOpen_buffer/Author_TS/')

SD_save.py

@@ -0,0 +1,85 @@
import os
import json

# Temporarily store data in a small JSON file (roughly one list page, about 50 articles)
def save_data(dataset, filetype, filename):
    if dataset:
        directory = "./SpringerOpen_buffer/" + filetype + "/"
        os.makedirs(directory, exist_ok=True)
        filepath = os.path.join(directory, filename)
        with open(filepath, "w", encoding='utf-8') as json_file:
            json.dump(dataset, json_file, indent=4)
        print(filetype + " data have been added to", filepath)

# Final aggregation: filter the temporary files into large files split by year
def Transf():
    def Read(folder_path, output_files):
        # Create the output folders
        os.makedirs('./SpringerOpen_buffer/Article_output/', exist_ok=True)
        os.makedirs('./SpringerOpen_buffer/Author_output/', exist_ok=True)
        # Collect the records from all temporary files first,
        # so earlier files are not overwritten by later ones
        data = []
        for filename in os.listdir(folder_path):
            if filename.endswith('.json'):
                file_path = os.path.join(folder_path, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    data.extend(json.load(file))
        # Split the records by year
        data_oldest = [Dict for Dict in data if (isinstance(Dict, dict) and int(
            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
        data_2010_2014 = [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
        data_2015_2020 = [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
        data_newest = [Dict for Dict in data if (isinstance(Dict, dict) and int(
            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
        Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
        # Write each group into its output file
        for index in range(0, 4):
            with open(output_files[index], 'w', encoding='utf-8') as file:
                json.dump(Data[index], file, indent=4)

    # Folders to read from (the temporary files written during crawling)
    author_folder_path = './SpringerOpen_buffer/Author_TS'
    article_folder_path = './SpringerOpen_buffer/Article_TS'
    # Output file paths
    author_output_file = [
        './SpringerOpen_buffer/Author_output/Author_output_file(oldest).json',
        './SpringerOpen_buffer/Author_output/Author_output_file(2010-2014).json',
        './SpringerOpen_buffer/Author_output/Author_output_file(2015-2020).json',
        './SpringerOpen_buffer/Author_output/Author_output_file(newest).json'
    ]
    article_output_file = [
        './SpringerOpen_buffer/Article_output/Article_output_file(oldest).json',
        './SpringerOpen_buffer/Article_output/Article_output_file(2010-2014).json',
        './SpringerOpen_buffer/Article_output/Article_output_file(2015-2020).json',
        './SpringerOpen_buffer/Article_output/Article_output_file(newest).json'
    ]
    # Read the temporary files and write the grouped output files
    Read(author_folder_path, author_output_file)
    Read(article_folder_path, article_output_file)
    # End
    print("\nData has been written into files.")

# Delete all files in the temporary storage area
def delete(folder_path):
    file_names = os.listdir(folder_path)
    for file_name in file_names:
        file_path = os.path.join(folder_path, file_name)
        if os.path.isfile(file_path):
            os.remove(file_path)
    print('\nAttention: The temporary storage files have been deleted!')

SD_scrawl.py

@@ -0,0 +1,48 @@
import time
import urllib
import uuid
from urllib.parse import urljoin
import SD_header
import SD_link
import SD_detail
import SD_save

# ========== Get the links of the article detail pages ==========
def Scrawl(Link, Article_data, Author_data):
    # Visit the article-list page
    headers = SD_header.header()
    soup = SD_link.Link(Link, headers)
    print(Link)
    # Get the links of all article detail pages
    Essay_Ol = soup.find('ol')  # The article list
    Essay_Li = Essay_Ol.find_all('li')  # The entries that hold the detail-page links
    # Crawl every article on this list page (about 50 articles)
    for Essay_hrefs in Essay_Li:
        Essay_href = Essay_hrefs.find('a', itemprop='url')
        if Essay_href is not None:
            time.sleep(0.1)
            sub_Link = Essay_href.get('href')  # The (possibly relative) link
            # Resolve the link against the current list page, so every journal keeps its own domain
            sub_Link = urllib.parse.urljoin(Link, sub_Link)
            # ========== Visit the article detail page ==========
            sub_soup = SD_link.Link(sub_Link, headers)  # Fetch the detail page
            article_id = str(uuid.uuid4())  # Label the article
            # Extract the details and append them to the corresponding lists
            Article_data.append(SD_detail.Article_dict(sub_soup, sub_Link, article_id))
            Author_data = SD_detail.Author_dict(sub_soup, article_id, Author_data)
    # Temporarily store this batch in small JSON files
    if Article_data:
        index = str(uuid.uuid4())
        SD_save.save_data(Article_data, "Article_TS", index + ".json")
        print('Finished: ' + index + ' has been added to ./SpringerOpen_buffer/Article_TS/')
        Article_data.clear()  # Empty the shared list so saved records are not written again
    if Author_data:
        index = str(uuid.uuid4())
        SD_save.save_data(Author_data, "Author_TS", index + ".json")
        print('Finished: ' + index + ' has been added to ./SpringerOpen_buffer/Author_TS/')
        Author_data.clear()

SD_threads.py

@@ -0,0 +1,25 @@
from concurrent.futures import ThreadPoolExecutor, as_completed, wait
import SD_scrawl

# ========== Multithreaded processing ==========
def Threads(Links, Article_data, Author_data):
    executor = ThreadPoolExecutor(max_workers=20)  # Thread pool
    # Submit one crawl task per article-list page
    futures = [executor.submit(SD_scrawl.Scrawl, Link, Article_data, Author_data) for Link in Links]
    # max_iterations = 5  # Maximum number of pages to process (for testing)
    # iteration_count = 0  # Counter
    # Wait for all tasks to complete
    for future in as_completed(futures):
        try:
            future.result()
            # # Limit the maximum number of pages crawled
            # iteration_count += 1  # Increment the counter
            # if iteration_count >= max_iterations:
            #     break
        except Exception as e:
            print("An error occurred:", str(e))
    wait(futures)