Merge branch 'main' of https://git.ecwuuuuu.com/datamining/CST_scrawlCode
commit ee0f956645
@@ -1,38 +0,0 @@ (deleted file)
import os
import json


# Function
# Get the data from input files
def Read(folder_path):
    data = []

    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                data.extend(json.load(file))
    return data


# Write into output files
def Write(data, output_file):
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)


# Path of files need to be read
folder_path1 = '.\ejde_buffer\Author'
folder_path2 = '.\ejde_buffer\Article'

# Read the data in the files
Author_data = Read(folder_path1)
Article_data = Read(folder_path2)

# The path of output files
output_file1 = '.\ejde_buffer\Author_output_file.json'
output_file2 = '.\ejde_buffer\Article_output_file.json'

# Write into files
Write(Author_data, output_file1)
Write(Article_data, output_file2)

# End
print("\nData has been written into files.")
(modified file)
@@ -1,23 +1,22 @@
-import os
 import uuid
 import requests
-from bs4 import BeautifulSoup
 import re
-import json
+import ejde_save

 from concurrent.futures import ThreadPoolExecutor, as_completed
 from retrying import retry
+from bs4 import BeautifulSoup


-def save_data(dataset, filetype, filename):
-    if dataset:
-        directory = "./ejde_buffer/" + filetype + "/"
-        os.makedirs(directory, exist_ok=True)
-        filepath = os.path.join(directory, filename)
-        with open(filepath, "w", encoding='utf-8') as json_file:
-            json.dump(dataset, json_file, indent=4)
-        print(filetype + " data have been added to", filepath)
+'''
+Target site: 'ejde.math.txstate.edu'
+
+========== Run order ==========
+1. ejde_main: collect each year's journal links -> scrape every article's details and author details -> call ejde_save -> buffer the results in small JSON files
+2. ejde_save: scan the buffered small files locally, filter them, and write them into large per-period files
+*3. ejde_save.delete() (optional): delete every file in the temporary buffer (back it up first)
+'''

+# Article and author detail
 @retry(wait_fixed=5000, stop_max_attempt_number=5)
 def process_article(url):
     response = requests.get(url)
@@ -43,7 +42,7 @@ def process_article(url):

     # Extract volume
     volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
-    volume = volume_match.group(1) if volume_match else None
+    volume = str(volume_match.group(1)) if volume_match else None
     # year = volume_match.group(2) if volume_match else None

     # Extract pp
@@ -141,11 +140,11 @@ def process_article(url):

     # Save the data periodically based on batch size
     if len(articleData) % batch_size == 0:
-        save_data(articleData, "Article", str(uuid.uuid4()) + ".json")
+        ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
         articleData.clear()

     if len(authorData) % batch_size == 0:
-        save_data(authorData, "Author", str(uuid.uuid4()) + ".json")
+        ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
         authorData.clear()

@@ -162,7 +161,7 @@ url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_l
 authorData = []
 articleData = []

-batch_size = 500  # Number of articles to process before saving
+batch_size = 5  # Number of articles to process before saving
 executor = ThreadPoolExecutor(max_workers=20)  # Set the number of worker threads

 # Process each URL using multithreading
@@ -176,10 +175,14 @@ for future in as_completed(futures):
         print("An error occurred:", str(e))

 # Save remaining data
-if articleData:
-    save_data(articleData, "Article", str(uuid.uuid4()) + ".json")
-    print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article/")
+if len(articleData) > 0:
+    ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
+    print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")

-if authorData:
-    save_data(authorData, "Author", str(uuid.uuid4()) + ".json")
-    print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author/")
+if len(authorData) > 0:
+    ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
+    print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
+
+# Transfer to large file and delete the temporary storage files
+ejde_save.Transf()
+ejde_save.delete()
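A note on the retry behaviour this file keeps relying on: with the retrying package, @retry(wait_fixed=5000, stop_max_attempt_number=5) re-runs the decorated function whenever it raises, waiting five seconds between attempts and giving up after the fifth try. A minimal sketch of that pattern with a hypothetical fetch() helper; the decorated process_article() above behaves the same way:

import requests
from retrying import retry

@retry(wait_fixed=5000, stop_max_attempt_number=5)
def fetch(url):
    # Any exception raised here (timeout, bad HTTP status) triggers a retry
    # after a fixed 5000 ms wait, up to 5 attempts in total.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.text

# html = fetch('https://ejde.math.txstate.edu/')  # hypothetical call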
EJDE_spider/ejde_save.py (new file, 93 lines)
@@ -0,0 +1,93 @@
import os
import json


# Save data
def save_data(dataset, filetype, filename):
    if dataset:
        directory = "./ejde_buffer/" + filetype + "/"
        os.makedirs(directory, exist_ok=True)
        filepath = os.path.join(directory, filename)
        with open(filepath, "w", encoding='utf-8') as json_file:
            json.dump(dataset, json_file, indent=4)
        print(filetype + " data have been added to", filepath)


# Write into output files
def Transf():
    def Read(folder_path, output_files):
        # Create new folders
        os.makedirs('./ejde_buffer/Article_output/', exist_ok=True)
        os.makedirs('./ejde_buffer/Author_output/', exist_ok=True)

        data_oldest = []
        data_2010_2014 = []
        data_2015_2020 = []
        data_newest = []

        for filename in os.listdir(folder_path):
            if filename.endswith('.json'):
                file_path = os.path.join(folder_path, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    data = json.load(file)

                for Dict in data:
                    if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
                        # Select data
                        data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]

                        data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]

                        data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]

                        data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]

        Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]

        # Transfer
        for index in range(0, 4):
            with open(output_files[index], 'w', encoding='utf-8') as file:
                json.dump(Data[index], file, indent=4)

    # The path of reading
    author_folder_path = './ejde_buffer/Author_TS'
    article_folder_path = './ejde_buffer/Article_TS'

    # The path of storage
    author_output_file = [
        './ejde_buffer/Author_output/Author_output_file(oldest).json',
        './ejde_buffer/Author_output/Author_output_file(2010-2014).json',
        './ejde_buffer/Author_output/Author_output_file(2015-2020).json',
        './ejde_buffer/Author_output/Author_output_file(newest).json'
    ]

    article_output_file = [
        './ejde_buffer/Article_output/Article_output_file(oldest).json',
        './ejde_buffer/Article_output/Article_output_file(2010-2014).json',
        './ejde_buffer/Article_output/Article_output_file(2015-2020).json',
        './ejde_buffer/Article_output/Article_output_file(newest).json'
    ]

    # Read and write into files
    Read(author_folder_path, author_output_file)
    Read(article_folder_path, article_output_file)

    # End
    print("\nData has been written into files.")


# Delete files in temporary storage area
def delete():
    folder_paths = ['./ejde_buffer/Author_TS', './ejde_buffer/Article_TS']
    for folder_path in folder_paths:
        file_names = os.listdir(folder_path)
        for file_name in file_names:
            file_path = os.path.join(folder_path, file_name)
            if os.path.isfile(file_path):
                os.remove(file_path)

    print('\nAttention: The temporary storage files have been deleted!')
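The period split inside Transf() keys off either a record's 'volume' or, failing that, the first affiliation's 'year', and treats the chosen number as a four-digit year. A simplified sketch of that bucketing rule, assuming the same field layout as the buffered JSON; the sample records are made up:

def bucket(record):
    # Same selector the list comprehensions use: prefer 'volume', fall back
    # to the first affiliation's 'year'.
    year = int(record.get('volume') or record.get('affiliation', [{}])[0].get('year', 0))
    if year <= 2009:
        return 'oldest'
    if year <= 2014:
        return '2010-2014'
    if year <= 2020:
        return '2015-2020'
    return 'newest'

print(bucket({'volume': '2013'}))                 # 2010-2014
print(bucket({'affiliation': [{'year': 2022}]}))  # newest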
EJQTDE_spider/ejqtde_main.py (new file, 113 lines)
@@ -0,0 +1,113 @@
import re
import datetime
import threading
import urllib
import ejqtde_scrawler
import ejqtde_save

from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.edge.options import Options
from concurrent.futures import ThreadPoolExecutor, as_completed, wait
from urllib.parse import urljoin

'''
Target site: 'https://www.math.u-szeged.hu/ejqtde'

========== Run order ==========
1. ejqtde_main: collect each year's journal links
2. ejqtde_scrawler: scrape every article's details and author details -> call ejqtde_save -> buffer the results in small JSON files
3. ejqtde_save: scan the buffered small files locally, filter them, and write them into large per-period files
*4. ejqtde_save.delete() (optional): delete every file in the temporary buffer (back it up first)
'''


# Multithread pool
def extract_href(link):
    driver = webdriver.Edge(options=options)
    driver.get(link)
    html_code = driver.page_source
    soup = BeautifulSoup(html_code, 'html.parser')
    column_right = soup.find('div', id='columnRight')
    if column_right:
        ordered_lists = column_right.find_all('ol')
        for idx, ordered_list in enumerate(ordered_lists, 1):
            for list_item in ordered_list.find_all('li'):
                matches = re.findall(r'</a>: <a\s+href="(periodica\.html\?periodica=1&'
                                     r'paramtipus_ertek=publication&param_ertek=\d+)"', str(list_item))
                for match in matches:
                    URL = urllib.parse.urljoin('https://www.math.u-szeged.hu/ejqtde/', match)
                    hrefs.append(URL)

    print('Links got: ', link)
    driver.quit()


# Empty list
Author_list = []
Article_list = []
hrefs = []

# Base web urls
baseWeb = 'https://www.math.u-szeged.hu/ejqtde/'
current_year = datetime.datetime.now().year
years = range(2009, 2011)  # years = range(2010, current_year + 1)
url_list = ['https://www.math.u-szeged.hu/ejqtde/periodica.html?periodica=1&paramtipus_ertek=publications&param_ertek='
            + f'{year}' for year in years][::-1]

# Options setting
options = Options()
options.add_argument('--headless')  # Run Edge in headless mode
options.add_argument('disable-gpu')  # Disable GPU acceleration
options.add_argument('pageLoadStrategy=none')  # Set page load strategy to 'none'

# Use multithreading to process URLs concurrently
with ThreadPoolExecutor(max_workers=25) as executor:
    futures = [executor.submit(extract_href, url) for url in url_list]
    for future in as_completed(futures):
        pass

wait(futures)
print('\nAll links have been got.\n')

# Use multithreading to get the data
count1 = 0
count2 = 0
locks = threading.Lock()
scrawl_lock = threading.Lock()

with ThreadPoolExecutor(max_workers=25) as executor:
    futures = [executor.submit(ejqtde_scrawler.scrawler, href, scrawl_lock, Article_list, Author_list) for href in hrefs]
    for future in as_completed(futures):
        if len(Article_list) >= 50:
            with locks:
                count1 += len(Article_list)
                ejqtde_save.save_data(Article_list, "Article_TS")
                Article_list.clear()

        if len(Author_list) >= 50:
            with locks:
                count2 += len(Author_list)
                ejqtde_save.save_data(Author_list, "Author_TS")
                Author_list.clear()

wait(futures)

# Deal with the remaining data
if len(Article_list) > 0:
    count1 += len(Article_list)
    ejqtde_save.save_data(Article_list, "Article_TS")
    Article_list.clear()
    print('Finished: All article_data has been added to ./EJQTDE_buffer/Article_TS/')
if len(Author_list) > 0:
    count2 += len(Author_list)
    ejqtde_save.save_data(Author_list, "Author_TS")
    Author_list.clear()
    print('Finished: All author_data has been added to ./EJQTDE_buffer/Author_TS/')

print('\nThe whole scrawler program has been done\n')
print(count1, ' article_data has been stored.')
print(count2, ' author_data has been stored.')

# Transfer to large file and delete the temporary storage files
ejqtde_save.Transf()
ejqtde_save.delete()
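The driver above shares Article_list and Author_list between the scraper threads and the main as_completed() loop, taking a lock before flushing a batch of 50 records to the buffer. A stripped-down, hypothetical sketch of that produce-and-flush pattern, where worker() and flush() stand in for ejqtde_scrawler.scrawler and ejqtde_save.save_data:

import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

results = []
lock = threading.Lock()
BATCH = 50

def worker(i):
    # Stand-in for the scraper: append one record under the lock.
    with lock:
        results.append({'id': i})

def flush(batch):
    # Stand-in for save_data: report the batch instead of writing JSON.
    print('flushed', len(batch), 'records')

with ThreadPoolExecutor(max_workers=8) as pool:
    futures = [pool.submit(worker, i) for i in range(120)]
    for _ in as_completed(futures):
        with lock:
            if len(results) >= BATCH:
                flush(results)
                results.clear()

# Flush whatever is left once the pool has drained.
if results:
    flush(results)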
EJQTDE_spider/ejqtde_save.py (new file, 96 lines)
@@ -0,0 +1,96 @@
import os
import json
import uuid


# Save into files
def save_data(dataset, filetype):
    if dataset:
        filename = str(uuid.uuid4()) + ".json"
        directory = "./EJQTDE_buffer/" + filetype + "/"
        os.makedirs(directory, exist_ok=True)
        filepath = os.path.join(directory, filename)
        with open(filepath, "w", encoding='utf-8') as json_file:
            json.dump(dataset, json_file, indent=4)
        print(filetype + " data have been added to", filepath)


# Summary files
def Transf():
    def Read(folder_path, output_files):
        # Create new folder
        os.makedirs('./EJQTDE_buffer/Article_output/', exist_ok=True)
        os.makedirs('./EJQTDE_buffer/Author_output/', exist_ok=True)

        data_oldest = []
        data_2010_2014 = []
        data_2015_2020 = []
        data_newest = []

        for filename in os.listdir(folder_path):
            if filename.endswith('.json'):
                file_path = os.path.join(folder_path, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    data = json.load(file)

                for Dict in data:
                    if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
                        # Select data
                        data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]

                        data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]

                        data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]

                        data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]

        Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]

        # Transfer
        for index in range(0, 4):
            with open(output_files[index], 'w', encoding='utf-8') as file:
                json.dump(Data[index], file, indent=4)

    # The path of reading
    author_folder_path = './EJQTDE_buffer/Author_TS'
    article_folder_path = './EJQTDE_buffer/Article_TS'

    # The path of storage
    author_output_file = [
        './EJQTDE_buffer/Author_output/Author_output_file(oldest).json',
        './EJQTDE_buffer/Author_output/Author_output_file(2010-2014).json',
        './EJQTDE_buffer/Author_output/Author_output_file(2015-2020).json',
        './EJQTDE_buffer/Author_output/Author_output_file(newest).json'
    ]

    article_output_file = [
        './EJQTDE_buffer/Article_output/Article_output_file(oldest).json',
        './EJQTDE_buffer/Article_output/Article_output_file(2010-2014).json',
        './EJQTDE_buffer/Article_output/Article_output_file(2015-2020).json',
        './EJQTDE_buffer/Article_output/Article_output_file(newest).json'
    ]

    # Read and write into files
    Read(author_folder_path, author_output_file)
    Read(article_folder_path, article_output_file)

    # End
    print("\nData has been written into files.")


# Delete files in temporary storage area
def delete():
    folder_paths = ['./EJQTDE_buffer/Author_TS', './EJQTDE_buffer/Article_TS']
    for folder_path in folder_paths:
        file_names = os.listdir(folder_path)
        for file_name in file_names:
            file_path = os.path.join(folder_path, file_name)
            if os.path.isfile(file_path):
                os.remove(file_path)

    print('\nAttention: The temporary storage files have been deleted!')
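Taken together with the run-order docstring in ejqtde_main.py, a typical call sequence for this module looks as follows; the sample records are invented, and the sketch assumes ejqtde_save.py above is on the import path:

import ejqtde_save

# Buffer one small batch per list into ./EJQTDE_buffer/<filetype>/<uuid>.json
ejqtde_save.save_data([{'article_id': 'demo', 'volume': '2015'}], 'Article_TS')
ejqtde_save.save_data([{'from_article': 'demo', 'affiliation': [{'year': 2015}]}], 'Author_TS')

# Merge everything buffered so far into the four per-period output files
ejqtde_save.Transf()

# Optionally clear the temporary buffer afterwards (back it up first)
# ejqtde_save.delete()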
EJQTDE_spider/ejqtde_scrawler.py (new file, 187 lines)
@@ -0,0 +1,187 @@
import time
import uuid
import re
import urllib

from selenium.webdriver.edge.options import Options
from selenium import webdriver
from bs4 import BeautifulSoup
from urllib.parse import urljoin


# Get the information in the webpage through selenium
def source(driver, num):
    if driver.find_elements(by='id', value='columnRight'):
        html_code = driver.page_source
        soup = BeautifulSoup(html_code, 'html.parser')
        return soup
    elif num == 5:
        print('Out of times!')
        driver.quit()
        return None
    else:
        num += 1
        time.sleep(3)
        return source(driver, num)


# Get the links of the authors' information
def author_links(Data):
    Author_links = []
    Author_hrefs_pattern = re.compile(r'periodica\.html\?periodica=1&'
                                      'paramtipus_ertek=person_data&param_ertek=\d+')
    Author_hrefs = re.findall(Author_hrefs_pattern, str(Data))
    for Author_href in Author_hrefs:
        Author_href = urllib.parse.urljoin('https://www.math.u-szeged.hu/ejqtde/', Author_href)
        Author_links.append(Author_href)

    return Author_links


# Get the information of the authors
def author_detail(Data, Year, article_id, Author_list):
    # Name
    author = Data.find('p', class_='publication_head').get_text()

    author = author.split(',')
    author = [char.replace(' ', '') for char in author]

    Firstname = author[0]
    Lastname = author[-1]
    Middlename = ''.join(author[1:-1]) if author[1:-1] else None

    # infor
    table = Data.find('table', attrs={'border': '1', 'cellpadding': '2px'})
    Td = table.find_all('td')
    line = [td for td in Td]

    # Affiliation
    Affiliation = line[1].get_text()

    # Email
    Email = line[0].find('a').get('href')

    author_data = {
        "author_id": str(uuid.uuid4()),
        "from_article": article_id,
        "firstname": Firstname,
        "lastname": Lastname,
        "middlename": Middlename,
        "affiliation": [
            {
                "year": Year,
                "affiliation": Affiliation,
                "email": Email
            }
        ]
    }

    Author_list.append(author_data)
    return Author_list


# Get the article's information
def article_detail(Data, URL, article_id, Aricle_list):
    # Title
    font = Data.find('font', attrs={'size': '+1'})
    Title = font.find('b').get_text()

    # Author and Corresponding_authors
    author_pattern = re.compile(r'periodica\.html\?periodica=1&'
                                r'paramtipus_ertek=person_data&param_ertek=\d+"><b>(.*?)</b>')
    Author = re.findall(author_pattern, str(Data))
    Corresponding_author = Author[-1]  # Corresponding_authors
    del Author[-1]

    # Submit_datetime and publish_datetime
    time = Data.find('td', attrs={'align': 'right', 'width': '50%'})
    time = re.findall(r'\d+-\d+-\d+', str(time))
    Submit_date = time[0] if time[0] else None
    Publish_date = time[1] if time[1] else None

    # Keyword
    Keyword = Data.find('keywords').get_text() if Data.find('keywords') is not None else None
    Keyword = Keyword.split(', ') if Keyword is not None else None

    # MSC
    MSC = Data.find('subjectcodes').get_text() if Data.find('subjectcodes') is not None else None
    MSC = MSC.split(', ') if MSC is not None else None

    # DOI
    if len(re.findall(r'<a href="(https://doi\.org/.*?)"', str(Data))) > 0:
        DOI = re.findall(r'<a href="(https://doi\.org/.*?)"', str(Data))[0]
    else:
        DOI = None

    # Publisher
    Publisher = 'www.math.u-szeged.hu/ejqtde'

    # Journal
    Journal = 'Electronic Journal of Qualitative Theory of Differential Equations'

    # Volume
    Volume = re.findall(r'<b>(\d+)</b>', str(Data))[0]

    # Issue and page
    result = Data.select_one('body > div:nth-of-type(3) > div:nth-of-type(2)').get_text()
    Issue = re.findall(r'(\d+), \d+-\d+', result)[0]
    Page = re.findall(r'\d+, (\d+-\d+)', result)[0]

    article_data = {
        "article_id": article_id,
        "title": Title,
        "authors": Author,
        "corresponding_authors": Corresponding_author,
        "submit_datetime": Submit_date,
        "publish_datetime": Publish_date,
        "keywords": Keyword,
        "MSC": MSC,
        "URL": URL,
        "DOI": DOI,
        "publisher": Publisher,
        "journal": Journal,
        "volume": Volume,
        "issue": Issue,
        "page": Page,
    }

    Aricle_list.append(article_data)
    return Aricle_list


# Main code of scrawler
def scrawler(URL, lock, Article_list, Author_list):
    print('Start: ', URL)
    driver = webdriver.Edge(options=options)
    driver.get(URL)

    # Enter the detail page
    Max_retryTimes = 3
    Essay_data = source(driver, Max_retryTimes)
    if Essay_data is not None:
        article_id = str(uuid.uuid4())
        Article_list = article_detail(Essay_data, URL, article_id, Article_list)

        # Get the authors' information
        Year = re.findall(r'<b>(\d+)</b>', str(Essay_data))[0]
        for author_link in author_links(Essay_data):
            driver.get(author_link)
            Author_detail = source(driver, Max_retryTimes)
            Author_list = author_detail(Author_detail, Year, article_id, Author_list)

        print('Complete: ', URL)
        driver.quit()

    else:
        print('Wrong: Some error occurred: ', URL)
        pass


# Options setting
options = Options()
options.add_argument('--headless')  # Run Edge in headless mode
options.add_argument('disable-gpu')  # Disable GPU acceleration
options.add_argument('pageLoadStrategy=none')  # Set page load strategy to 'none'
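For clarity on the name handling in author_detail() above: the publication_head text is split on commas, spaces are stripped from every chunk, the first chunk becomes the firstname, the last chunk the lastname, and anything in between is joined into the middlename (or None). A small illustration with made-up headings:

def split_name(head):
    # Mirrors the splitting done in author_detail(); 'head' is a made-up example.
    parts = [chunk.replace(' ', '') for chunk in head.split(',')]
    firstname = parts[0]
    lastname = parts[-1]
    middlename = ''.join(parts[1:-1]) if parts[1:-1] else None
    return firstname, middlename, lastname

print(split_name('John, Michael, Doe'))  # ('John', 'Michael', 'Doe')
print(split_name('Jane, Doe'))           # ('Jane', None, 'Doe')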
(modified file)
@@ -88,7 +88,7 @@ def Article_dict(soup, url, article_id):
         time = time.get_text()
         Time.append(time)

-    Submitted_date = Time[0]
+    Submit_date = Time[0]
     Publish_date = Time[-1]

     # keyword
@@ -132,7 +132,7 @@ def Article_dict(soup, url, article_id):
         "title": Title,
         "authors": Author,
         "corresponding_authors": Corresponding_author,
-        "submit_datetime": Submitted_date,
+        "submit_datetime": Submit_date,
         "publish_datetime": Publish_date,
         "keywords": Keyword,
         "MSC": MSC,
(deleted file)
@@ -1,25 +0,0 @@
import random


# Pool of user-agent strings
uapools = [
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3493.3 Safari/537.36",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
]


def header():
    # Request headers for the site
    headers = {
        'User-Agent': random.choice(uapools),
    }

    return headers
(modified file)
@@ -1,6 +1,28 @@
+import random
 import requests
 from bs4 import BeautifulSoup

+# Pool of user-agent strings
+uapools = [
+    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3493.3 Safari/537.36",
+    "Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
+    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
+    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
+    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
+]
+
+
+def header():
+    # Request headers for the site
+    headers = {
+        'User-Agent': random.choice(uapools),
+    }
+
+    return headers
+
 # Standard access wrapper
 def Link(url, headers):
     try:
(modified file)
@@ -1,6 +1,5 @@
 import urllib

-import SD_header
 import SD_link
 import SD_threads
 import SD_save
@@ -20,8 +19,8 @@ from urllib.parse import urljoin


 # Empty lists for the page links
-Links = []  # A list for links
 Webs = []  # A list for web url
+Links = []  # A list for links

 # Empty lists for the scraped data
 Article_data = []
@@ -29,7 +28,7 @@ Author_data = []

 # ========== Visit the article list pages ==========
 # Get the links of the mathematics journal sites
-headers = SD_header.header()
+headers = SD_link.header()
 soup = SD_link.Link('https://www.springeropen.com/journals', headers)

 hrefs = soup.find('ol', id='Mathematics-list')
(modified file)
@@ -20,23 +20,30 @@ def Transf():
         os.makedirs('./SpringerOpen_buffer/Article_output/', exist_ok=True)
         os.makedirs('./SpringerOpen_buffer/Author_output/', exist_ok=True)

+        data_oldest = []
+        data_2010_2014 = []
+        data_2015_2020 = []
+        data_newest = []
+
         for filename in os.listdir(folder_path):
             if filename.endswith('.json'):
                 file_path = os.path.join(folder_path, filename)
                 with open(file_path, 'r', encoding='utf-8') as file:
                     data = json.load(file)

+                for Dict in data:
+                    if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
                         # Filter the articles
-                data_oldest = [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
+                        data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]

-                data_2010_2014 = [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
+                        data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]

-                data_2015_2020 = [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
+                        data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]

-                data_newest = [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
+                        data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]

         Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
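The substantive change in this hunk is '=' becoming '+=' on the four bucket lists, along with the lists now being initialised before the file loop: with '=', every JSON file read in the loop overwrote the buckets, so only the records from the last buffered file survived; with '+=', the matches from every file accumulate. A two-chunk illustration of the difference:

buckets = []
for chunk in ([1, 2], [3]):
    buckets += [x for x in chunk if x > 0]   # accumulates across chunks
    # buckets = [x for x in chunk if x > 0]  # would keep only the last chunk
print(buckets)  # [1, 2, 3]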
(modified file)
@@ -3,7 +3,6 @@ import urllib
 import uuid
 from urllib.parse import urljoin

-import SD_header
 import SD_link
 import SD_detail
 import SD_save
@@ -11,7 +10,7 @@ import SD_save
 # ========== Get the links of the article detail pages ==========
 def Scrawl(Link, Article_data, Author_data):
     # Visit the article list page
-    headers = SD_header.header()
+    headers = SD_link.header()
     soup = SD_link.Link(Link, headers)
     print(Link)
