From d8addf520416fd65427e34730098fc76cd2e04ab Mon Sep 17 00:00:00 2001
From: XCX <1361986662@qq.com>
Date: Fri, 14 Jul 2023 18:50:36 +0800
Subject: [PATCH] Update the EJDE and SpringerOpen spiders

---
 EJDE_spider/Transf.py             |  38 ++++++
 EJDE_spider/ejde_scrawler.py      | 185 ++++++++++++++++++++++++++++++
 SpringerOpen_spider/SD_detail.py  | 148 ++++++++++++++++++++++++
 SpringerOpen_spider/SD_header.py  |  25 ++++
 SpringerOpen_spider/SD_link.py    |  15 +++
 SpringerOpen_spider/SD_main.py    |  75 ++++++++++++
 SpringerOpen_spider/SD_save.py    |  85 ++++++++++++++
 SpringerOpen_spider/SD_scrawl.py  |  48 ++++++++
 SpringerOpen_spider/SD_threads.py |  25 ++++
 9 files changed, 644 insertions(+)
 create mode 100644 EJDE_spider/Transf.py
 create mode 100644 EJDE_spider/ejde_scrawler.py
 create mode 100644 SpringerOpen_spider/SD_detail.py
 create mode 100644 SpringerOpen_spider/SD_header.py
 create mode 100644 SpringerOpen_spider/SD_link.py
 create mode 100644 SpringerOpen_spider/SD_main.py
 create mode 100644 SpringerOpen_spider/SD_save.py
 create mode 100644 SpringerOpen_spider/SD_scrawl.py
 create mode 100644 SpringerOpen_spider/SD_threads.py

diff --git a/EJDE_spider/Transf.py b/EJDE_spider/Transf.py
new file mode 100644
index 0000000..dce6a10
--- /dev/null
+++ b/EJDE_spider/Transf.py
@@ -0,0 +1,38 @@
+import os
+import json
+
+
+# Read and merge the JSON data from every file in a folder
+def Read(folder_path):
+    data = []
+    for filename in os.listdir(folder_path):
+        if filename.endswith('.json'):
+            file_path = os.path.join(folder_path, filename)
+            with open(file_path, 'r', encoding='utf-8') as file:
+                data.extend(json.load(file))
+    return data
+
+
+# Write the merged data into an output file
+def Write(data, output_file):
+    with open(output_file, 'w', encoding='utf-8') as file:
+        json.dump(data, file, indent=4)
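+
+
+# Optional, illustrative variant of Read (not called anywhere): it skips
+# buffer files that are missing or malformed, so one bad file cannot abort
+# the whole merge. It assumes the same folder layout used below.
+def Read_safe(folder_path):
+    data = []
+    for filename in os.listdir(folder_path):
+        if filename.endswith('.json'):
+            file_path = os.path.join(folder_path, filename)
+            try:
+                with open(file_path, 'r', encoding='utf-8') as file:
+                    content = json.load(file)
+            except (OSError, json.JSONDecodeError):
+                print("Skipping unreadable buffer file:", file_path)
+                continue
+            if isinstance(content, list):
+                data.extend(content)
+    return data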
+
+
+# Paths of the files to be read (forward slashes avoid the invalid "\e"
+# escape in the original backslash paths)
+folder_path1 = './ejde_buffer/Author'
+folder_path2 = './ejde_buffer/Article'
+
+# Read the data in the files
+Author_data = Read(folder_path1)
+Article_data = Read(folder_path2)
+
+# Paths of the output files
+output_file1 = './ejde_buffer/Author_output_file.json'
+output_file2 = './ejde_buffer/Article_output_file.json'
+
+# Write into files
+Write(Author_data, output_file1)
+Write(Article_data, output_file2)
+
+# End
+print("\nData has been written into files.")
\ No newline at end of file
diff --git a/EJDE_spider/ejde_scrawler.py b/EJDE_spider/ejde_scrawler.py
new file mode 100644
index 0000000..49b95e0
--- /dev/null
+++ b/EJDE_spider/ejde_scrawler.py
@@ -0,0 +1,185 @@
+import os
+import threading
+import uuid
+import requests
+from bs4 import BeautifulSoup
+import re
+import json
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from retrying import retry
+
+
+def save_data(dataset, filetype, filename):
+    if dataset:
+        directory = "./ejde_buffer/" + filetype + "/"
+        os.makedirs(directory, exist_ok=True)
+        filepath = os.path.join(directory, filename)
+        with open(filepath, "w", encoding='utf-8') as json_file:
+            json.dump(dataset, json_file, indent=4)
+        print(filetype + " data have been added to", filepath)
+
+
+# Retry each volume page up to 5 times, waiting 5 seconds between attempts
+@retry(wait_fixed=5000, stop_max_attempt_number=5)
+def process_article(url):
+    response = requests.get(url)
+    response.raise_for_status()
+
+    baseWeb = url[:url.rfind('/')] + "/"
+    html = response.text
+    soup = BeautifulSoup(html, "html.parser")
+
+    articles = soup.find_all("li")
+
+    for article in articles:
+        # Skip <li> entries that are not article listings
+        if article.find("strong") is None or article.find("em") is None or article.find("a") is None:
+            continue
+
+        authors = article.find("strong").text.strip().split(", ")
+        title = article.find("em").text.strip()
+        article_url = baseWeb + article.find("a")["href"]
+
+        # Access the article detail page
+        response = requests.get(article_url)
+        html = response.text
+        soup = BeautifulSoup(html, 'html.parser')
+
+        article_text = soup.get_text()
+
+        # Extract volume
+        volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
+        volume = volume_match.group(1) if volume_match else None
+        # year = volume_match.group(2) if volume_match else None
+
+        # Extract page range
+        pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
+        pp = pp_match.group(1) if pp_match else None
+
+        # Extract issue
+        issue_match = re.search(r'No\. (\d+)', article_text)
+        issue = issue_match.group(1) if issue_match else None
+
+        # Extract submission date
+        match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
+        submitted_date = match.group(1) if match else None
+
+        # Extract publication date
+        match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
+        publish_date = match.group(1) if match else None
+
+        # Extract MSC codes
+        msc_match = re.search(r'Math Subject Classifications: (.*?)\n', html)
+        if not msc_match:
+            msc_match = re.search(r'Math Subject Classifications:(.*?)\.', html)
+        if msc_match:
+            msc = msc_match.group(1).strip().strip('.')
+            msc = re.split(r', |;', msc)
+        else:
+            msc = None
+
+        # Extract keywords
+        keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
+        if not keywords_match:
+            # Fallback: match to the end of the line (the old non-greedy
+            # pattern with nothing after it always matched an empty string)
+            keywords_match = re.search(r'Key Words: (.*)', html)
+        if keywords_match:
+            keywords = keywords_match.group(1).strip().replace('\n', '')
+            keywords = re.split(r', |;', keywords)
+            keywords = [keyword.strip().strip('.') for keyword in keywords]
+        else:
+            keywords = None
+
+        # Extract DOI
+        doi_match = re.search(r'DOI: (.+)(?=<)', html)
+        if not doi_match:
+            doi_match = re.search(r'DOI: (.+)', html)
+        doi = doi_match.group(1) if doi_match else None
+
+        # Article id
+        article_id = str(uuid.uuid4())
+
+        article_data = {
+            "article_id": article_id,
+            "title": title,
+            "authors": authors,
+            "corresponding_authors": None,
+            "submit_datetime": submitted_date,
+            "publish_datetime": publish_date,
+            "keywords": keywords,
+            "MSC": msc,
+            "URL": article_url,
+            "DOI": doi,
+            "publisher": "Texas State University",
+            "journal": "Electronic Journal of Differential Equations",
+            "volume": volume,
+            "issue": issue,
+            "page": pp,
+        }
+        articleData.append(article_data)
+
+        # Author info
+        table = soup.find('table')
+        if table is None:
+            continue
+        for row in table.find_all('tr'):
+            cells = [cell.text.strip() for cell in row.find_all('td')]
+            for cell in cells:
+                cell = cell.split("\n")
+                cell = [element.replace('email: ', '') for element in cell]
+                cell = [c.strip() for c in cell]
+
+                # Split the cell into name, affiliation, and email
+                name = cell[0].split(" ")
+                affiliation = ', '.join(cell[1:-1])
+                email = cell[-1]
+
+                author_data = {
+                    "author_id": str(uuid.uuid4()),
+                    "from_article": article_id,
+                    "first_name": name[0],
+                    "last_name": name[-1],
+                    "middle_name": name[1:len(name) - 1] if len(name) > 2 else None,
+                    "affiliation": [{
+                        "year": volume,
+                        "affiliation": affiliation,
+                        "email": email,
+                    }]
+                }
+                authorData.append(author_data)
+
+        # Save the data periodically based on batch size; the lock keeps the
+        # check-save-clear sequence atomic across the worker threads
+        with data_lock:
+            if articleData and len(articleData) % batch_size == 0:
+                save_data(articleData, "Article", str(uuid.uuid4()) + ".json")
+                articleData.clear()
+
+            if authorData and len(authorData) % batch_size == 0:
+                save_data(authorData, "Author", str(uuid.uuid4()) + ".json")
+                authorData.clear()
+
+
+index = "https://ejde.math.txstate.edu/indexleft.html"
+response = requests.get(index)
+soup = BeautifulSoup(response.content, 'html.parser')
+
+# Find all the URL links under the first (Volumes) section
+volume_links = soup.select('font > a[href]')
+
+# Extract and store the URLs in a list, oldest volume first
+url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1]
+
+authorData = []
+articleData = []
+data_lock = threading.Lock()  # Guards the shared lists across threads
+
+batch_size = 500  # Number of articles to process before saving
+executor = ThreadPoolExecutor(max_workers=20)  # Set the number of worker threads
+
+# Process each URL using multithreading
+futures = [executor.submit(process_article, url) for url in url_list]
+
+# Wait for all tasks to complete
+for future in as_completed(futures):
+    try:
+        future.result()
+    except Exception as e:
+        print("An error occurred:", str(e))
+
+# Save remaining data
+if articleData:
+    save_data(articleData, "Article", str(uuid.uuid4()) + ".json")
+    print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article/")
+
+if authorData:
+    save_data(authorData, "Author", str(uuid.uuid4()) + ".json")
+    print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author/")
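+
+# Added tidy-up (a safe no-op at this point): release the worker threads;
+# the as_completed() loop above already guarantees every future is done.
+executor.shutdown(wait=True)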
diff --git a/SpringerOpen_spider/SD_detail.py b/SpringerOpen_spider/SD_detail.py
new file mode 100644
index 0000000..8dd02dc
--- /dev/null
+++ b/SpringerOpen_spider/SD_detail.py
@@ -0,0 +1,148 @@
+import uuid
+
+
+# ========== Extract the detail-page data ==========
+def Author_dict(soup, article_id, Author_list):
+    info = soup.find('article', lang='en')
+    author_info = info.find('div', id='author-information-content')
+    article_info = info.find('div', class_='c-article-header')
+
+    # Authors
+    authors = article_info.find('ul', class_='c-article-author-list')
+    authors = authors.find_all('li', class_='c-article-author-list__item')
+    for author in authors:
+        # Name
+        author = author.find('a').get_text()
+        author = author.split(' ')
+        author = [char.replace('-', '') for char in author]
+
+        Firstname = author[0]
+        Lastname = author[-1]
+        Middlename = ''.join(author[1:-1]) if author[1:-1] else None
+
+        # Year
+        Year = info.find('span', attrs={'data-test': 'article-publication-year'}).get_text()
+
+        # Affiliation (only the first listed address is taken)
+        Affiliation = author_info.find('p', class_='c-article-author-affiliation__address').get_text()
+
+        # Email
+        Email = None  # The page does not expose author email addresses
+
+        # Put the fields into a dict
+        author_data = {
+            "author_id": str(uuid.uuid4()),
+            "from_article": article_id,
+            "firstname": Firstname,
+            "lastname": Lastname,
+            "middlename": Middlename,
+            "affiliation": [
+                {
+                    "year": Year,
+                    "affiliation": Affiliation,
+                    "email": Email
+                }
+            ]
+        }
+
+        Author_list.append(author_data)
+
+    return Author_list
+
+
+def Article_dict(soup, url, article_id):
+    info = soup.find('article', lang='en')
+    article_info = info.find('div', class_='c-article-header')
+
+    # Title
+    Title = article_info.find('h1').get_text()
+
+    # Authors
+    Author = []  # A new empty list
+
+    author_list = article_info.find('ul', class_='c-article-author-list')
+    authors = author_list.find_all('li', class_='c-article-author-list__item')
+    for author in authors:
+        author = author.find('a').get_text()
+        author = [char.replace('-', '') for char in author]
+        author = ''.join(author)
+        Author.append(author)
+
+    # Corresponding authors (the list may be absent, so check it before
+    # calling find_all on it)
+    Corresponding_author = []  # A new empty list
+
+    corresponding_author_list = info.find('p', id='corresponding-author-list')
+    if corresponding_author_list is not None:
+        corresponding_authors = corresponding_author_list.find_all('a')
+        for corresponding_author in corresponding_authors:
+            corresponding_author = corresponding_author.get_text()
+            corresponding_author = [char.replace('-', '') for char in corresponding_author]
+            corresponding_author = ''.join(corresponding_author)
+            Corresponding_author.append(corresponding_author)
+
+    # Submitted datetime & published datetime
+    Time = []
+
+    time_list = info.find('ul', class_='c-bibliographic-information__list')
+    times = time_list.find_all('time')
+    for time in times:
+        time = time.get_text()
+        Time.append(time)
+
+    Submitted_date = Time[0]
+    Publish_date = Time[-1]
+
+    # Keywords
+    Keyword = []  # A new empty list
+
+    keyword_list = info.find('ul', class_='c-article-subject-list')
+    if keyword_list is not None:
+        keywords = keyword_list.find_all('li')
+        for keyword in keywords:
+            keyword = keyword.get_text()
+            Keyword.append(keyword)
+
+    # MSC
+    MSC = None  # springeropen.com does not publish MSC codes
+
+    # DOI
+    DOI = info.find('li', class_='c-bibliographic-information__list-item c-bibliographic-information__list-item--doi')
+    if DOI is not None:
+        DOI = DOI.find('span', class_='c-bibliographic-information__value').get_text()
+
+    # Publisher
+    Publisher = 'springeropen.com'
+
+    # Journal
+    Journal = info.find('p', class_='c-article-info-details')
+    Journal = Journal.find('i').get_text()
+
+    # Volume (the publication year is stored here; SD_save filters on it)
+    Volume = info.find('span', attrs={'data-test': 'article-publication-year'}).get_text()
+
+    # Issue (the article number stands in for the issue)
+    Issue = info.find('p', class_='c-article-info-details')
+    Issue = Issue.find('span', attrs={'data-test': 'article-number'}).get_text()
+
+    # Page
+    Page = None  # Articles carry an article number rather than page numbers
+
+    # Put the fields into a dict
+    article_data = {
+        "article_id": article_id,
+        "title": Title,
+        "authors": Author,
+        "corresponding_authors": Corresponding_author,
+        "submit_datetime": Submitted_date,
+        "publish_datetime": Publish_date,
+        "keywords": Keyword,
+        "MSC": MSC,
+        "URL": url,
+        "DOI": DOI,
+        "publisher": Publisher,
+        "journal": Journal,
+        "volume": Volume,
+        "issue": Issue,
+        "page": Page,
+    }
+
+    return article_data
\ No newline at end of file
diff --git a/SpringerOpen_spider/SD_header.py b/SpringerOpen_spider/SD_header.py
new file mode 100644
index 0000000..162e82d
--- /dev/null
+++ b/SpringerOpen_spider/SD_header.py
@@ -0,0 +1,25 @@
+import random
+
+# Pool of User-Agent strings to rotate through
+uapools = [
+    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3493.3 Safari/537.36",
+    "Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
+    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
+    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
+    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
+]
+
+
+def header():
+    # Request headers with a randomly chosen User-Agent
+    headers = {
+        'User-Agent': random.choice(uapools),
+    }
+
+    return headers
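+
+# Illustrative usage (assumes the requests package, as used in SD_link.py):
+#     import requests, SD_header
+#     response = requests.get(url, headers=SD_header.header())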
diff --git a/SpringerOpen_spider/SD_link.py b/SpringerOpen_spider/SD_link.py
new file mode 100644
index 0000000..a46d542
--- /dev/null
+++ b/SpringerOpen_spider/SD_link.py
@@ -0,0 +1,15 @@
+import requests
+from bs4 import BeautifulSoup
+
+
+# Standard request wrapper: fetch a page and return its parsed soup
+def Link(url, headers):
+    try:
+        response = requests.get(url, headers=headers)
+        response.raise_for_status()  # Raise if the request failed
+        html = response.text
+        soup = BeautifulSoup(html, 'html.parser')
+        return soup
+
+    except requests.exceptions.RequestException as e:
+        print("Request failed:", e)
+        return None
diff --git a/SpringerOpen_spider/SD_main.py b/SpringerOpen_spider/SD_main.py
new file mode 100644
index 0000000..e94fd72
--- /dev/null
+++ b/SpringerOpen_spider/SD_main.py
@@ -0,0 +1,75 @@
+import SD_header
+import SD_link
+import SD_threads
+import SD_save
+from urllib.parse import urljoin
+
+'''
+    Target site: https://www.springeropen.com
+
+    ========== Execution order ==========
+    1. SD_main    Collect the links of all mathematics journals on SpringerOpen
+                  -> collect the links of each journal's article-list pages
+    2. SD_threads Manage the thread pool -> call SD_scrawl
+    3. SD_scrawl  Collect the article detail-page links -> call SD_detail
+    4. SD_detail  Parse each detail page -> call SD_save -> buffer the records
+                  in small JSON files
+    5. SD_main    Call SD_save -> filter the buffered files into large
+                  per-period output files
+    *6. SD_save   (optional) Delete every buffered file (back them up first)
+'''
+
+
+# Empty lists for the page links
+Links = []  # A list for article-list page links
+Webs = []   # A list for journal URLs and page counts
+
+# Empty lists for the crawled data
+Article_data = []
+Author_data = []
+
+# ========== Visit the article-list pages ==========
+# Collect the mathematics journal links
+headers = SD_header.header()
+soup = SD_link.Link('https://www.springeropen.com/journals', headers)
+
+hrefs = soup.find('ol', id='Mathematics-list')
+hrefs = hrefs.find_all('a')
+for href in hrefs:
+    href = 'http:' + href.get('href') + '/articles'
+    sub_soup = SD_link.Link(href, headers)
+
+    # Number of article-list pages in the current journal
+    pp = sub_soup.find('p', class_='u-text-sm u-reset-margin').get_text()
+    pp = pp.split(' ')[-1]
+
+    # Build the paged list URL
+    url = urljoin(href, 'articles?searchType=journalSearch&sort=PubDate&page=')
+
+    # Store the URL and page count
+    web = {
+        "url": url,
+        "page": int(pp)
+    }
+    Webs.append(web)
+
+# Expand each journal into one link per list page
+for web in Webs:
+    for page in range(1, web['page'] + 1):
+        link = web['url'] + str(page)
+        Links.append(link)
+
+print('\nThe links have been stored!\n')
+
+# Start crawling in the thread pool
+SD_threads.Threads(Links, Article_data, Author_data)
+
+# Merge the buffered JSON files
+SD_save.Transf()
+
+# # ========== Delete all buffered files (optional; back them up first) ==========
+# SD_save.delete('./SpringerOpen_buffer/Article_TS/')
+# SD_save.delete('./SpringerOpen_buffer/Author_TS/')
diff --git a/SpringerOpen_spider/SD_save.py b/SpringerOpen_spider/SD_save.py
new file mode 100644
index 0000000..b088f5c
--- /dev/null
+++ b/SpringerOpen_spider/SD_save.py
@@ -0,0 +1,85 @@
+import os
+import json
+
+
+# Buffer the data into a small JSON file (about 50 articles each)
+def save_data(dataset, filetype, filename):
+    if dataset:
+        directory = "./SpringerOpen_buffer/" + filetype + "/"
+        os.makedirs(directory, exist_ok=True)
+        filepath = os.path.join(directory, filename)
+        with open(filepath, "w", encoding='utf-8') as json_file:
+            json.dump(dataset, json_file, indent=4)
+        print(filetype + " data have been added to", filepath)
+
+
+# Filter the buffered files and merge them into the final output files
+def Transf():
+    def Read(folder_path, output_files):
+        # Create the output folders
+        os.makedirs('./SpringerOpen_buffer/Article_output/', exist_ok=True)
+        os.makedirs('./SpringerOpen_buffer/Author_output/', exist_ok=True)
+
+        # Merge every buffered file first, so a later file does not
+        # overwrite the output already written for an earlier one
+        data = []
+        for filename in os.listdir(folder_path):
+            if filename.endswith('.json'):
+                file_path = os.path.join(folder_path, filename)
+                with open(file_path, 'r', encoding='utf-8') as file:
+                    data.extend(json.load(file))
+
+        # Split the records by year (articles carry the year in 'volume',
+        # authors carry it inside 'affiliation')
+        data_oldest = [Dict for Dict in data if (isinstance(Dict, dict) and int(
+            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
+
+        data_2010_2014 = [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
+            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
+
+        data_2015_2020 = [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
+            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
+
+        data_newest = [Dict for Dict in data if (isinstance(Dict, dict) and int(
+            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
+
+        Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
+
+        # Write each period to its output file
+        for index in range(0, 4):
+            with open(output_files[index], 'w', encoding='utf-8') as file:
+                json.dump(Data[index], file, indent=4)
+
+    # Input folders (the "_TS" buffer folders written by SD_scrawl)
+    author_folder_path = './SpringerOpen_buffer/Author_TS'
+    article_folder_path = './SpringerOpen_buffer/Article_TS'
+
+    # Output files
+    author_output_file = [
+        './SpringerOpen_buffer/Author_output/Author_output_file(oldest).json',
+        './SpringerOpen_buffer/Author_output/Author_output_file(2010-2014).json',
+        './SpringerOpen_buffer/Author_output/Author_output_file(2015-2020).json',
+        './SpringerOpen_buffer/Author_output/Author_output_file(newest).json'
+    ]
+
+    article_output_file = [
+        './SpringerOpen_buffer/Article_output/Article_output_file(oldest).json',
+        './SpringerOpen_buffer/Article_output/Article_output_file(2010-2014).json',
+        './SpringerOpen_buffer/Article_output/Article_output_file(2015-2020).json',
+        './SpringerOpen_buffer/Article_output/Article_output_file(newest).json'
+    ]
+
+    # Read the buffers and write the merged files
+    Read(author_folder_path, author_output_file)
+    Read(article_folder_path, article_output_file)
+
+    # End
+    print("\nData has been written into files.")
+
+
+# Delete every file in a buffer folder
+def delete(folder_path):
+    file_names = os.listdir(folder_path)
+
+    for file_name in file_names:
+        file_path = os.path.join(folder_path, file_name)
+        if os.path.isfile(file_path):
+            os.remove(file_path)
+
+    print('\nAttention: The temporary storage files have been deleted!')
diff --git a/SpringerOpen_spider/SD_scrawl.py b/SpringerOpen_spider/SD_scrawl.py
new file mode 100644
index 0000000..c79f41a
--- /dev/null
+++ b/SpringerOpen_spider/SD_scrawl.py
@@ -0,0 +1,48 @@
+import time
+import uuid
+from urllib.parse import urljoin
+
+import SD_header
+import SD_link
+import SD_detail
+import SD_save
+
+
+# ========== Collect the article detail-page links ==========
+def Scrawl(Link, Article_data, Author_data):
+    # Visit the article-list page
+    headers = SD_header.header()
+    soup = SD_link.Link(Link, headers)
+    print(Link)
+
+    # Collect every article detail-page link
+    Essay_Ol = soup.find('ol')          # The article list
+    Essay_Li = Essay_Ol.find_all('li')  # All list items
+
+    # Crawl every article on this list page (about 50)
+    for Essay_hrefs in Essay_Li:
+        Essay_href = Essay_hrefs.find('a', itemprop='url')
+        if Essay_href is not None:
+            time.sleep(0.1)
+
+            # Resolve the relative link against the current journal's own
+            # domain rather than one hard-coded journal
+            sub_Link = Essay_href.get('href')
+            sub_Link = urljoin(Link, sub_Link)
+
+            # ========== Visit the article detail page ==========
+            sub_soup = SD_link.Link(sub_Link, headers)  # Fetch the detail page
+            article_id = str(uuid.uuid4())              # Article id
+
+            # Parse the details and append them to the matching lists
+            Article_data.append(SD_detail.Article_dict(sub_soup, sub_Link, article_id))
+            Author_data = SD_detail.Author_dict(sub_soup, article_id, Author_data)
+
+    # Buffer into small JSON files, then clear the lists so the same
+    # records are not written again for the next page
+    if Article_data:
+        index = str(uuid.uuid4())
+        SD_save.save_data(Article_data, "Article_TS", index + ".json")
+        Article_data.clear()
+        print('Finished: ' + index + ' has been added to ./SpringerOpen_buffer/Article_TS/')
+
+    if Author_data:
+        index = str(uuid.uuid4())
+        SD_save.save_data(Author_data, "Author_TS", index + ".json")
+        Author_data.clear()
+        print('Finished: ' + index + ' has been added to ./SpringerOpen_buffer/Author_TS/')
\ No newline at end of file
diff --git a/SpringerOpen_spider/SD_threads.py b/SpringerOpen_spider/SD_threads.py
new file mode 100644
index 0000000..d5d7527
--- /dev/null
+++ b/SpringerOpen_spider/SD_threads.py
@@ -0,0 +1,25 @@
+from concurrent.futures import ThreadPoolExecutor, as_completed, wait
+import SD_scrawl
+
+
+# ========== Multithreaded processing ==========
+def Threads(Links, Article_data, Author_data):
+    executor = ThreadPoolExecutor(max_workers=20)  # Thread pool
+
+    # Submit one crawl task per article-list link
+    futures = [executor.submit(SD_scrawl.Scrawl, Link, Article_data, Author_data) for Link in Links]
+
+    # max_iterations = 5  # Maximum number of pages to crawl
+    # iteration_count = 0  # Counter
+
+    # Wait for every task to complete
+    for future in as_completed(futures):
+        try:
+            future.result()
+            # # Cap the number of pages crawled
+            # iteration_count += 1  # Increment the counter
+            # if iteration_count >= max_iterations:
+            #     break
+        except Exception as e:
+            print("An error occurred:", str(e))
+
+    wait(futures)
\ No newline at end of file
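
--
A minimal driver sketch (illustrative only; the names come from SD_main.py
and SD_threads.py above, and SD_main.py already performs these steps):

    import SD_threads
    # Links: one URL per article-list page, assembled in SD_main.py
    SD_threads.Threads(Links, Article_data, Author_data)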