From cfa9345a797868844ed200b458b11b801421b8f9 Mon Sep 17 00:00:00 2001
From: XCX <1361986662@qq.com>
Date: Wed, 26 Jul 2023 23:25:30 +0800
Subject: [PATCH] Add a new spider for math.u-szeged.hu/ejqtde; modify the
 SpringerOpen_spider code

---
 EJQTDE_spider/ejqtde_href_multithread.py | 111 ++++++++++++++
 EJQTDE_spider/ejqtde_save.py             |  93 +++++++++++
 EJQTDE_spider/ejqtde_scrawler.py         | 187 +++++++++++++++++++++++
 SpringerOpen_spider/SD_detail.py         |   4 +-
 SpringerOpen_spider/SD_header.py         |  25 ---
 SpringerOpen_spider/SD_link.py           |  22 +++
 SpringerOpen_spider/SD_main.py           |   5 +-
 SpringerOpen_spider/SD_save.py           |  13 +-
 SpringerOpen_spider/SD_scrawl.py         |   3 +-
 9 files changed, 427 insertions(+), 36 deletions(-)
 create mode 100644 EJQTDE_spider/ejqtde_href_multithread.py
 create mode 100644 EJQTDE_spider/ejqtde_save.py
 create mode 100644 EJQTDE_spider/ejqtde_scrawler.py
 delete mode 100644 SpringerOpen_spider/SD_header.py

diff --git a/EJQTDE_spider/ejqtde_href_multithread.py b/EJQTDE_spider/ejqtde_href_multithread.py
new file mode 100644
index 0000000..311feca
--- /dev/null
+++ b/EJQTDE_spider/ejqtde_href_multithread.py
@@ -0,0 +1,111 @@
+import re
+import datetime
+import threading
+import urllib
+import ejqtde_scrawler
+import ejqtde_save
+
+from selenium import webdriver
+from bs4 import BeautifulSoup
+from selenium.webdriver.edge.options import Options
+from concurrent.futures import ThreadPoolExecutor, as_completed, wait
+from urllib.parse import urljoin
+
+'''
+    Target site: https://www.math.u-szeged.hu/ejqtde
+
+    ========== Run order ==========
+    1. ejqtde_href_multithread   collect the article links of every volume/year
+    2. ejqtde_scrawler           scrape each paper's metadata and its authors' details
+                                 -> calls ejqtde_save -> dumps small JSON files into the temporary buffer
+    3. ejqtde_save               read the buffered small files, filter them and merge them into per-period output files
+    *4. ejqtde_save.delete()     (optional) delete every file in the temporary buffer (back them up first)
+'''
+
+
+# Worker of the multithread pool: collect the article links from one index page
+def extract_href(link):
+    driver = webdriver.Edge(options=options)
+    driver.get(link)
+    html_code = driver.page_source
+    soup = BeautifulSoup(html_code, 'html.parser')
+    column_right = soup.find('div', id='columnRight')
+    if column_right:
+        ordered_lists = column_right.find_all('ol')
+        for idx, ordered_list in enumerate(ordered_lists, 1):
+            for list_item in ordered_list.find_all('li'):
+                matches = re.findall(r'href="([^"]+)"', str(list_item))
+                for match in matches:
+                    with locks:
+                        Links.append(urljoin('https://www.math.u-szeged.hu/ejqtde/', match))
+    driver.quit()
+
+
+# Options setting
+options = Options()
+options.add_argument('--headless')   # Run Edge in headless mode
+options.add_argument('disable-gpu')  # Disable GPU acceleration
+
+# Shared data of all threads
+Links = []           # article detail-page links
+Article_list = []    # buffer of scraped article records
+Author_list = []     # buffer of scraped author records
+locks = threading.Lock()
+count1 = 0
+count2 = 0
+
+# ==========Collect the article links==========
+Volume_pages = [urljoin('https://www.math.u-szeged.hu/ejqtde/', 'periodica.html?periodica=1')]
+with ThreadPoolExecutor(max_workers=5) as executor:
+    wait([executor.submit(extract_href, page) for page in Volume_pages])
+print(len(Links), ' links have been collected.')
+
+# ==========Scrawl every article==========
+with ThreadPoolExecutor(max_workers=5) as executor:
+    futures = [executor.submit(ejqtde_scrawler.scrawler, link, locks, Article_list, Author_list)
+               for link in Links]
+    for future in as_completed(futures):
+        if len(Article_list) >= 50:
+            with locks:
+                count1 += len(Article_list)
+                ejqtde_save.save_data(Article_list, "Article_TS")
+                Article_list.clear()
+
+        if len(Author_list) >= 50:
+            with locks:
+                count2 += len(Author_list)
+                ejqtde_save.save_data(Author_list, "Author_TS")
+                Author_list.clear()
+    wait(futures)
+
+    # Deal with the remaining data
+    if len(Article_list) > 0:
+        count1 += len(Article_list)
+        ejqtde_save.save_data(Article_list, "Article_TS")
+        Article_list.clear()
+        print('Finished: All article_data has been added to ./EJQTDE_buffer/Article_TS/')
+    if len(Author_list) > 0:
+        count2 += len(Author_list)
+        ejqtde_save.save_data(Author_list, "Author_TS")
+        Author_list.clear()
+        print('Finished: All author_data has been added to ./EJQTDE_buffer/Author_TS/')
+
+print('\nThe whole scrawler program has been done\n')
+print(count1, ' article_data has been stored.')
+print(count2, ' author_data has been stored.')
+
+ejqtde_save.Transf()
+ejqtde_save.delete()
\ No newline at end of file
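The driver script above accumulates scraped records in two shared lists and flushes them to the JSON staging folder whenever a list reaches 50 entries, so a long crawl never keeps the whole journal in memory. A minimal, self-contained sketch of that flush-on-threshold pattern (`flush` and `scrape_one` are illustrative stand-ins for `ejqtde_save.save_data` and `ejqtde_scrawler.scrawler`, not code from this patch):

    import threading
    import uuid
    from concurrent.futures import ThreadPoolExecutor, as_completed

    FLUSH_SIZE = 50
    buffer_lock = threading.Lock()
    article_buffer = []

    def flush(records, label):
        # Stand-in for ejqtde_save.save_data: each batch would go to a uniquely named JSON file.
        print('would write', len(records), 'records to ./EJQTDE_buffer/' + label + '/' + str(uuid.uuid4()) + '.json')

    def scrape_one(n):
        # Stand-in for ejqtde_scrawler.scrawler: append one scraped record to the shared buffer.
        with buffer_lock:
            article_buffer.append({'article_id': str(uuid.uuid4()), 'n': n})

    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(scrape_one, n) for n in range(120)]
        for future in as_completed(futures):
            with buffer_lock:
                if len(article_buffer) >= FLUSH_SIZE:
                    flush(article_buffer, 'Article_TS')
                    article_buffer.clear()

    # Whatever is left after the pool has drained still has to be flushed once.
    if article_buffer:
        flush(article_buffer, 'Article_TS')
        article_buffer.clear()

In this sketch the lock is held around both the length check and the clear, so a batch can never be flushed twice when several futures complete at the same time.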
+ "/" + os.makedirs(directory, exist_ok=True) + filepath = os.path.join(directory, filename) + with open(filepath, "w", encoding='utf-8') as json_file: + json.dump(dataset, json_file, indent=4) + print(filetype + " data have been added to", filepath) + + +# Summary files +def Transf(): + def Read(folder_path, output_files): + # 新建文件夹 + os.makedirs('./EJQTDE_buffer/Article_output/', exist_ok=True) + os.makedirs('./EJQTDE_buffer/Author_output/', exist_ok=True) + + data_oldest = [] + data_2010_2014 = [] + data_2015_2020 = [] + data_newest = [] + + for filename in os.listdir(folder_path): + if filename.endswith('.json'): + file_path = os.path.join(folder_path, filename) + with open(file_path, 'r', encoding='utf-8') as file: + data = json.load(file) + + # Select data + data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)] + + data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)] + + data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)] + + data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int( + Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)] + + Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest] + + # Transfer + for index in range(0, 4): + with open(output_files[index], 'w', encoding='utf-8') as file: + json.dump(Data[index], file, indent=4) + + # The path of reading + author_folder_path = './EJQTDE_buffer/Author_TS' + article_folder_path = './EJQTDE_buffer/Article_TS' + + # The path of storage + author_output_file = [ + './EJQTDE_buffer/Author_output/Author_output_file(oldest).json', + './EJQTDE_buffer/Author_output/Author_output_file(2010-2014).json', + './EJQTDE_buffer/Author_output/Author_output_file(2015-2020).json', + './EJQTDE_buffer/Author_output/Author_output_file(newest).json' + ] + + article_output_file = [ + './EJQTDE_buffer/Article_output/Article_output_file(oldest).json', + './EJQTDE_buffer/Article_output/Article_output_file(2010-2014).json', + './EJQTDE_buffer/Article_output/Article_output_file(2015-2020).json', + './EJQTDE_buffer/Article_output/Article_output_file(newest).json' + ] + + # Read and write into files + Read(author_folder_path, author_output_file) + Read(article_folder_path, article_output_file) + + # End + print("\nData has been written into files.") + + +# Delete files in temporary storage area +def delete(): + folder_paths = ['./EJQTDE_buffer/Author_TS', './EJQTDE_buffer/Article_TS'] + for folder_path in folder_paths: + file_names = os.listdir(folder_path) + for file_name in file_names: + file_path = os.path.join(folder_path, file_name) + if os.path.isfile(file_path): + os.remove(file_path) + + print('\nAttention: The temporary storage files have been deleted!') diff --git a/EJQTDE_spider/ejqtde_scrawler.py b/EJQTDE_spider/ejqtde_scrawler.py new file mode 100644 index 0000000..1f3975d --- /dev/null +++ b/EJQTDE_spider/ejqtde_scrawler.py @@ -0,0 +1,187 @@ +import time +import uuid +import re +import urllib + +from selenium.webdriver.edge.options import Options +from selenium import webdriver +from bs4 import BeautifulSoup +from urllib.parse import urljoin + + +# Get the information in the webpage through selenium +def source(driver, num): + if driver.find_elements(by='id', 
diff --git a/EJQTDE_spider/ejqtde_scrawler.py b/EJQTDE_spider/ejqtde_scrawler.py
new file mode 100644
index 0000000..1f3975d
--- /dev/null
+++ b/EJQTDE_spider/ejqtde_scrawler.py
@@ -0,0 +1,187 @@
+import time
+import uuid
+import re
+import urllib
+
+from selenium.webdriver.edge.options import Options
+from selenium import webdriver
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin
+
+
+# Get the information in the webpage through selenium
+def source(driver, num):
+    if driver.find_elements(by='id', value='columnRight'):
+        html_code = driver.page_source
+        soup = BeautifulSoup(html_code, 'html.parser')
+        return soup
+    elif num == 5:
+        print('Out of retries!')
+        driver.quit()
+        return None
+    else:
+        num += 1
+        time.sleep(3)
+        return source(driver, num)
+
+
+# Get the links of the authors' information
+def author_links(Data):
+    Author_links = []
+    Author_hrefs_pattern = re.compile(r'periodica\.html\?periodica=1&'
+                                      r'paramtipus_ertek=person_data&param_ertek=\d+')
+    Author_hrefs = re.findall(Author_hrefs_pattern, str(Data))
+    for Author_href in Author_hrefs:
+        Author_href = urllib.parse.urljoin('https://www.math.u-szeged.hu/ejqtde/', Author_href)
+        Author_links.append(Author_href)
+
+    return Author_links
+
+
+# Get the information of the authors
+def author_detail(Data, Year, article_id, Author_list):
+    # Name
+    author = Data.find('p', class_='publication_head').get_text()
+
+    author = author.split(',')
+    author = [char.replace(' ', '') for char in author]
+
+    Firstname = author[0]
+    Lastname = author[-1]
+    Middlename = ''.join(author[1:-1]) if author[1:-1] else None
+
+    # Information table
+    table = Data.find('table', attrs={'border': '1', 'cellpadding': '2px'})
+    Td = table.find_all('td')
+    line = [td for td in Td]
+
+    # Affiliation
+    Affiliation = line[1].get_text()
+
+    # Email
+    Email = line[0].find('a').get('href')
+
+    author_data = {
+        "author_id": str(uuid.uuid4()),
+        "from_article": article_id,
+        "firstname": Firstname,
+        "lastname": Lastname,
+        "middlename": Middlename,
+        "affiliation": [
+            {
+                "year": Year,
+                "affiliation": Affiliation,
+                "email": Email
+            }
+        ]
+    }
+
+    Author_list.append(author_data)
+    return Author_list
+
+
+# Get the article's information
+def article_detail(Data, URL, article_id, Article_list):
+    # Title
+    font = Data.find('font', attrs={'size': '+1'})
+    Title = font.find('b').get_text()
+
+    # Author and Corresponding_authors
+    author_pattern = re.compile(r'periodica\.html\?periodica=1&'
+                                r'paramtipus_ertek=person_data&param_ertek=\d+">(.*?)</a>')
+    Author = re.findall(author_pattern, str(Data))
+    Corresponding_author = Author[-1]  # The last listed author is taken as the corresponding author
+    del Author[-1]
+
+    # Submit_datetime and publish_datetime
+    dates = Data.find('td', attrs={'align': 'right', 'width': '50%'})
+    dates = re.findall(r'\d+-\d+-\d+', str(dates))
+    Submit_date = dates[0] if len(dates) > 0 else None
+    Publish_date = dates[1] if len(dates) > 1 else None
+
+    # Keyword
+    Keyword = Data.find('keywords').get_text() if Data.find('keywords') is not None else None
+    Keyword = Keyword.split(', ') if Keyword is not None else None
+
+    # MSC
+    MSC = Data.find('subjectcodes').get_text() if Data.find('subjectcodes') is not None else None
+    MSC = MSC.split(', ') if MSC is not None else None
+
+    # DOI
+    doi_matches = re.findall(r'doi\.org/(10\.[^\s"\'<>]+)', str(Data))
+    DOI = doi_matches[0] if doi_matches else None
+
+    # Publisher, journal and volume (the journal is hosted by the University of Szeged;
+    # EJQTDE numbers its volumes by year, so the volume is taken from the publish date)
+    Publisher = 'University of Szeged'
+    Journal = 'Electronic Journal of Qualitative Theory of Differential Equations'
+    Volume = (re.findall(r'\d{4}', Publish_date) or [None])[0] if Publish_date else None
+
+    # Issue and page
+    result = Data.select_one('body > div:nth-of-type(3) > div:nth-of-type(2)').get_text()
+    Issue = re.findall(r'(\d+), \d+-\d+', result)[0]
+    Page = re.findall(r'\d+, (\d+-\d+)', result)[0]
+
+    article_data = {
+        "article_id": article_id,
+        "title": Title,
+        "authors": Author,
+        "corresponding_authors": Corresponding_author,
+        "submit_datetime": Submit_date,
+        "publish_datetime": Publish_date,
+        "keywords": Keyword,
+        "MSC": MSC,
+        "URL": URL,
+        "DOI": DOI,
+        "publisher": Publisher,
+        "journal": Journal,
+        "volume": Volume,
+        "issue": Issue,
+        "page": Page,
+    }
+
+    Article_list.append(article_data)
+    return Article_list
+
+
+# Main code of scrawler
+def scrawler(URL, lock, Article_list, Author_list):
+    print('Start: ', URL)
+    driver = webdriver.Edge(options=options)
+    driver.get(URL)
+
+    # Enter the detail page
+    Max_retryTimes = 3
+    Essay_data = source(driver, Max_retryTimes)
+    if Essay_data is not None:
+        article_id = str(uuid.uuid4())
+        Article_list = article_detail(Essay_data, URL, article_id, Article_list)
+
+        # Get the authors' information
+        Year = re.findall(r'(\d+)', str(Essay_data))[0]
+        for author_link in author_links(Essay_data):
+            driver.get(author_link)
+            Author_detail = source(driver, Max_retryTimes)
+            if Author_detail is not None:
+                Author_list = author_detail(Author_detail, Year, article_id, Author_list)
+
+        print('Complete: ', URL)
+        driver.quit()
+
+    else:
+        print('Wrong: Some error occurred: ', URL)
+        pass
+
+
+# Options setting
+options = Options()
+options.add_argument('--headless')  # Run Edge in headless mode
+options.add_argument('disable-gpu')  # Disable GPU acceleration
+options.add_argument('pageLoadStrategy=none')  # Set page load strategy to 'none'
+
+
diff --git a/SpringerOpen_spider/SD_detail.py b/SpringerOpen_spider/SD_detail.py
index 8dd02dc..73639c7 100644
--- a/SpringerOpen_spider/SD_detail.py
+++ b/SpringerOpen_spider/SD_detail.py
@@ -88,7 +88,7 @@ def Article_dict(soup, url, article_id):
         time = time.get_text()
         Time.append(time)
 
-    Submitted_date = Time[0]
+    Submit_date = Time[0]
     Publish_date = Time[-1]
 
     # keyword
@@ -132,7 +132,7 @@ def Article_dict(soup, url, article_id):
         "title": Title,
         "authors": Author,
         "corresponding_authors": Corresponding_author,
-        "submit_datetime": Submitted_date,
+        "submit_datetime": Submit_date,
         "publish_datetime": Publish_date,
         "keywords": Keyword,
         "MSC": MSC,
diff --git a/SpringerOpen_spider/SD_header.py b/SpringerOpen_spider/SD_header.py
deleted file mode 100644
index 162e82d..0000000
--- a/SpringerOpen_spider/SD_header.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import random
-
-# 用户代理地址池
-uapools=[
-    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201",
-    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3493.3 Safari/537.36",
-    "Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12",
-    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0" ,
-    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
-    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
-    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
-    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
-    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
-]
-
-def header():
-    # 网站请求头
-    headers = {
-        'User-Agent': random.choice(uapools),
-    }
-
-    return headers
-
-
-
diff --git a/SpringerOpen_spider/SD_link.py b/SpringerOpen_spider/SD_link.py
index a46d542..1b5eed1 100644
--- a/SpringerOpen_spider/SD_link.py
+++ b/SpringerOpen_spider/SD_link.py
@@ -1,6 +1,28 @@
+import random
 import requests
 from bs4 import BeautifulSoup
 
+# Pool of user-agent strings
+uapools=[
+    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3493.3 Safari/537.36",
+    "Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0" ,
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11', + 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1', + 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50', + 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1', +] + +def header(): + # 网站请求头 + headers = { + 'User-Agent': random.choice(uapools), + } + + return headers + # 标准访问格式 def Link(url, headers): try: diff --git a/SpringerOpen_spider/SD_main.py b/SpringerOpen_spider/SD_main.py index e94fd72..c56f993 100644 --- a/SpringerOpen_spider/SD_main.py +++ b/SpringerOpen_spider/SD_main.py @@ -1,6 +1,5 @@ import urllib -import SD_header import SD_link import SD_threads import SD_save @@ -20,8 +19,8 @@ from urllib.parse import urljoin # 存放网页链接的空列表 -Links = [] # A list for links Webs = [] # A list for web url +Links = [] # A list for links # 存放爬取数据的空列表 Article_data = [] @@ -29,7 +28,7 @@ Author_data = [] # ==========访问论文列表页========== # 获取数学类期刊网站链接 -headers = SD_header.header() +headers = SD_link.header() soup = SD_link.Link('https://www.springeropen.com/journals', headers) hrefs = soup.find('ol', id='Mathematics-list') diff --git a/SpringerOpen_spider/SD_save.py b/SpringerOpen_spider/SD_save.py index b088f5c..713fc76 100644 --- a/SpringerOpen_spider/SD_save.py +++ b/SpringerOpen_spider/SD_save.py @@ -20,6 +20,11 @@ def Transf(): os.makedirs('./SpringerOpen_buffer/Article_output/', exist_ok=True) os.makedirs('./SpringerOpen_buffer/Author_output/', exist_ok=True) + data_oldest = [] + data_2010_2014 = [] + data_2015_2020 = [] + data_newest = [] + for filename in os.listdir(folder_path): if filename.endswith('.json'): file_path = os.path.join(folder_path, filename) @@ -27,16 +32,16 @@ def Transf(): data = json.load(file) # 筛选文章 - data_oldest = [Dict for Dict in data if (isinstance(Dict, dict) and int( + data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int( Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)] - data_2010_2014 = [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int( + data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int( Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)] - data_2015_2020 = [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int( + data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int( Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)] - data_newest = [Dict for Dict in data if (isinstance(Dict, dict) and int( + data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int( Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)] Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest] diff --git a/SpringerOpen_spider/SD_scrawl.py b/SpringerOpen_spider/SD_scrawl.py index c79f41a..d2b8ed2 100644 --- a/SpringerOpen_spider/SD_scrawl.py +++ b/SpringerOpen_spider/SD_scrawl.py @@ -3,7 +3,6 @@ import urllib import uuid from urllib.parse import urljoin -import SD_header import SD_link import SD_detail import SD_save @@ -11,7 +10,7 @@ import SD_save # ==========获取论文详情页链接========== def Scrawl(Link, Article_data, Author_data): # 访问论文列表页 - headers = SD_header.header() + headers = SD_link.header() soup = SD_link.Link(Link, headers) print(Link)
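With SD_header.py folded into SD_link.py, every SpringerOpen module now takes both its rotating User-Agent header and its page fetcher from a single place. A minimal usage sketch of that pairing (the shortened uapools list, the timeout value and the local function names are illustrative; only the try/except structure mirrors SD_link.Link):

    import random
    import requests
    from bs4 import BeautifulSoup

    uapools = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3493.3 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    ]

    def header():
        # Pick a random User-Agent for each request, as SD_link.header() does.
        return {'User-Agent': random.choice(uapools)}

    def link(url, headers):
        # Fetch a page and hand back a parsed soup, mirroring SD_link.Link().
        try:
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
            return BeautifulSoup(response.text, 'html.parser')
        except requests.RequestException as error:
            print('Request failed:', error)
            return None

    soup = link('https://www.springeropen.com/journals', header())
    if soup is not None:
        print(soup.title.get_text() if soup.title else 'no <title> found')

Keeping header() next to Link() means SD_main and SD_scrawl only need the single SD_link import, which is exactly what the two hunks above change.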