Modified old code
commit 26fed37e17 (parent cfa9345a79)
@@ -1,38 +0,0 @@
-import os
-import json
-
-
-# Get the data from the input files
-def Read(folder_path):
-    data = []
-
-    for filename in os.listdir(folder_path):
-        if filename.endswith('.json'):
-            file_path = os.path.join(folder_path, filename)
-            with open(file_path, 'r', encoding='utf-8') as file:
-                data.extend(json.load(file))
-    return data
-
-
-# Write into the output files
-def Write(data, output_file):
-    with open(output_file, 'w', encoding='utf-8') as file:
-        json.dump(data, file, indent=4)
-
-
-# Paths of the files to be read
-folder_path1 = '.\ejde_buffer\Author'
-folder_path2 = '.\ejde_buffer\Article'
-
-# Read the data in the files
-Author_data = Read(folder_path1)
-Article_data = Read(folder_path2)
-
-# Paths of the output files
-output_file1 = '.\ejde_buffer\Author_output_file.json'
-output_file2 = '.\ejde_buffer\Article_output_file.json'
-
-# Write into the files
-Write(Author_data, output_file1)
-Write(Article_data, output_file2)
-
-# End
-print("\nData has been written into files.")
@@ -1,23 +1,22 @@
 import os
 import uuid
 import requests
 from bs4 import BeautifulSoup
 import re
 import json
+import ejde_save
 
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from retrying import retry
-from bs4 import BeautifulSoup
 
 '''
 Crawled site: 'ejde.math.txstate.edu'
-'''
-
-
-def save_data(dataset, filetype, filename):
-    if dataset:
-        directory = "./ejde_buffer/" + filetype + "/"
-        os.makedirs(directory, exist_ok=True)
-        filepath = os.path.join(directory, filename)
-        with open(filepath, "w", encoding='utf-8') as json_file:
-            json.dump(dataset, json_file, indent=4)
-        print(filetype + " data have been added to", filepath)
+
+========== Execution order ==========
+1. ejde_main: fetch the journal links for each year -> scrape each article's details and author information -> call ejde_save -> buffer the data in small JSON files
+2. ejde_save: scan the locally buffered small files, filter them, and merge them into large files grouped by year range
+*3. ejde_save.delete() (optional): delete all files in the temporary storage area (back them up first)
+'''
 
 # Article and author detail
 @retry(wait_fixed=5000, stop_max_attempt_number=5)
 def process_article(url):
     response = requests.get(url)
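For reference, the @retry decorator above comes from the retrying package: wait_fixed is in milliseconds and stop_max_attempt_number caps the attempts. A minimal sketch of that behavior, assuming a hypothetical fetch helper (raise_for_status is added here so HTTP error codes also trigger a retry, which the repository's version does not do):

from retrying import retry
import requests

@retry(wait_fixed=5000, stop_max_attempt_number=5)
def fetch(url):
    # Any exception raised here makes retrying wait 5 s and try again,
    # up to 5 attempts in total before the exception propagates.
    response = requests.get(url)
    response.raise_for_status()
    return response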
@@ -43,7 +42,7 @@ def process_article(url):
 
     # Extract volume
     volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
-    volume = volume_match.group(1) if volume_match else None
+    volume = str(volume_match.group(1)) if volume_match else None
    # year = volume_match.group(2) if volume_match else None
 
     # Extract pp
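For reference, what the volume pattern above matches, shown on an illustrative header string (the sample text is not from the repository):

import re

m = re.search(r'Vol\. (\d+) \((\d+)\)', "Electron. J. Differential Equations, Vol. 36 (2023)")
if m:
    print(m.group(1), m.group(2))  # -> 36 2023  (volume, year)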
@@ -141,11 +140,11 @@ def process_article(url):
 
     # Save the data periodically based on batch size
     if len(articleData) % batch_size == 0:
-        save_data(articleData, "Article", str(uuid.uuid4()) + ".json")
+        ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
         articleData.clear()
 
     if len(authorData) % batch_size == 0:
-        save_data(authorData, "Author", str(uuid.uuid4()) + ".json")
+        ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
         authorData.clear()
 
 
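The periodic-save logic above in isolation: records accumulate in a list, and whenever its length hits a multiple of batch_size the list is written to a fresh UUID-named buffer file and emptied. A sketch under those assumptions (flush_batch is an illustrative helper, not part of the repository):

import uuid
import ejde_save

def flush_batch(buffer, filetype, batch_size):
    # Write the buffered records to a uniquely named JSON file and
    # empty the buffer; save_data itself skips empty datasets.
    if len(buffer) % batch_size == 0:
        ejde_save.save_data(buffer, filetype, str(uuid.uuid4()) + ".json")
        buffer.clear()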
@@ -162,7 +161,7 @@ url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_l
 authorData = []
 articleData = []
 
-batch_size = 500  # Number of articles to process before saving
+batch_size = 5  # Number of articles to process before saving
 executor = ThreadPoolExecutor(max_workers=20)  # Set the number of worker threads
 
 # Process each URL using multithreading
@@ -176,10 +175,14 @@ for future in as_completed(futures):
        print("An error occurred:", str(e))
 
 # Save remaining data
-if articleData:
-    save_data(articleData, "Article", str(uuid.uuid4()) + ".json")
-    print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article/")
+if len(articleData) > 0:
+    ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
+    print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
 
-if authorData:
-    save_data(authorData, "Author", str(uuid.uuid4()) + ".json")
-    print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author/")
+if len(authorData) > 0:
+    ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
+    print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
+
+# Transfer to large files and delete the temporary storage files
+ejde_save.Transf()
+ejde_save.delete()
EJDE_spider/ejde_save.py (new file, 93 lines)
@@ -0,0 +1,93 @@
+import os
+import json
+
+
+# Save a batch of data to a temporary-storage JSON file
+def save_data(dataset, filetype, filename):
+    if dataset:
+        directory = "./ejde_buffer/" + filetype + "/"
+        os.makedirs(directory, exist_ok=True)
+        filepath = os.path.join(directory, filename)
+        with open(filepath, "w", encoding='utf-8') as json_file:
+            json.dump(dataset, json_file, indent=4)
+        print(filetype + " data have been added to", filepath)
+
+
+# Merge the buffered files into the output files
+def Transf():
+    def Read(folder_path, output_files):
+        # Create the output folders
+        os.makedirs('./ejde_buffer/Article_output/', exist_ok=True)
+        os.makedirs('./ejde_buffer/Author_output/', exist_ok=True)
+
+        data_oldest = []
+        data_2010_2014 = []
+        data_2015_2020 = []
+        data_newest = []
+
+        for filename in os.listdir(folder_path):
+            if filename.endswith('.json'):
+                file_path = os.path.join(folder_path, filename)
+                with open(file_path, 'r', encoding='utf-8') as file:
+                    data = json.load(file)
+
+                # Bucket the records by year, once per file: article records
+                # carry the year in 'volume', author records in
+                # affiliation[0]['year']
+                data_oldest += [Dict for Dict in data if isinstance(Dict, dict) and int(
+                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009]
+
+                data_2010_2014 += [Dict for Dict in data if isinstance(Dict, dict) and 2010 <= int(
+                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014]
+
+                data_2015_2020 += [Dict for Dict in data if isinstance(Dict, dict) and 2015 <= int(
+                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020]
+
+                data_newest += [Dict for Dict in data if isinstance(Dict, dict) and int(
+                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021]
+
+        Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
+
+        # Transfer
+        for index in range(0, 4):
+            with open(output_files[index], 'w', encoding='utf-8') as file:
+                json.dump(Data[index], file, indent=4)
+
+    # The paths to read from
+    author_folder_path = './ejde_buffer/Author_TS'
+    article_folder_path = './ejde_buffer/Article_TS'
+
+    # The paths to write to
+    author_output_file = [
+        './ejde_buffer/Author_output/Author_output_file(oldest).json',
+        './ejde_buffer/Author_output/Author_output_file(2010-2014).json',
+        './ejde_buffer/Author_output/Author_output_file(2015-2020).json',
+        './ejde_buffer/Author_output/Author_output_file(newest).json'
+    ]
+
+    article_output_file = [
+        './ejde_buffer/Article_output/Article_output_file(oldest).json',
+        './ejde_buffer/Article_output/Article_output_file(2010-2014).json',
+        './ejde_buffer/Article_output/Article_output_file(2015-2020).json',
+        './ejde_buffer/Article_output/Article_output_file(newest).json'
+    ]
+
+    # Read and write into the files
+    Read(author_folder_path, author_output_file)
+    Read(article_folder_path, article_output_file)
+
+    # End
+    print("\nData has been written into files.")
+
+
+# Delete the files in the temporary storage area
+def delete():
+    folder_paths = ['./ejde_buffer/Author_TS', './ejde_buffer/Article_TS']
+    for folder_path in folder_paths:
+        file_names = os.listdir(folder_path)
+        for file_name in file_names:
+            file_path = os.path.join(folder_path, file_name)
+            if os.path.isfile(file_path):
+                os.remove(file_path)
+
+    print('\nAttention: The temporary storage files have been deleted!')
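How the module above is driven from the scraper side, following the execution order given in ejde_main's docstring (a sketch; the sample record and filename are illustrative):

import ejde_save

batch = [{"volume": "2012", "title": "An example article"}]   # illustrative record
ejde_save.save_data(batch, "Article_TS", "example.json")      # buffer one batch
ejde_save.Transf()   # merge buffered files into the four year-range outputs
ejde_save.delete()   # optional: clear ./ejde_buffer/*_TS (back up first)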
@@ -12,10 +12,10 @@ from concurrent.futures import ThreadPoolExecutor, as_completed, wait
 from urllib.parse import urljoin
 
 '''
-Crawled site: 'https://www.math.u-szeged.hu/ejqtde
+Crawled site: 'https://www.math.u-szeged.hu/ejqtde'
 
 ========== Execution order ==========
-1. ejqtde_href_multithread: fetch the journal links for each year
+1. ejqtde_main: fetch the journal links for each year
 2. ejqtde_scrawler: scrape each article's details and author information -> call ejqtde_save -> buffer the data in small JSON files
 3. ejqtde_save: scan the locally buffered small files, filter them, and merge them into large files grouped by year range
 *4. ejqtde_save.delete() (optional): delete all files in the temporary storage area (back them up first)
@@ -70,6 +70,7 @@ with ThreadPoolExecutor(max_workers=25) as executor:
     wait(futures)
 print('\nAll links have been fetched.\n')
 
+# Use multithreading to get the data
 count1 = 0
 count2 = 0
 locks = threading.Lock()
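The counters above are shared across worker threads, which is presumably why the Lock is created. The increment site is outside this hunk, so the following is an assumption about the pattern, not the repository's code:

import threading

count1 = 0
locks = threading.Lock()

def record_article():
    global count1
    with locks:      # serialize updates coming from worker threads
        count1 += 1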
@@ -107,5 +108,6 @@ print('\nThe whole scrawler program has been done\n')
 print(count1, ' article_data has been stored.')
 print(count2, ' author_data has been stored.')
 
+# Transfer to large files and delete the temporary storage files
 ejqtde_save.Transf()
 ejqtde_save.delete()
@@ -18,7 +18,7 @@ def save_data(dataset, filetype):
 # Summary files
 def Transf():
     def Read(folder_path, output_files):
-        # 新建文件夹
+        # Create new folders
         os.makedirs('./EJQTDE_buffer/Article_output/', exist_ok=True)
         os.makedirs('./EJQTDE_buffer/Author_output/', exist_ok=True)
 
@@ -33,6 +33,8 @@ def Transf():
             with open(file_path, 'r', encoding='utf-8') as file:
                 data = json.load(file)
 
+            for Dict in data:
+                if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
                     # Select data
                     data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
                         Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
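For reference, how the volume/affiliation fallback used in these filters resolves for the two record shapes the buffers contain (the sample records are illustrative):

article = {"volume": "2012"}                 # article records carry the year in 'volume'
author = {"affiliation": [{"year": 2018}]}   # author records carry it in affiliation[0]['year']

for rec in (article, author):
    year = int(rec.get('volume') or rec.get('affiliation', [{}])[0].get('year', 0))
    print(year)  # -> 2012, then 2018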
@@ -91,3 +93,4 @@ def delete():
             os.remove(file_path)
 
+    print('\nAttention: The temporary storage files have been deleted!')
 
ProjectEuclid_spider/projecteuclid_main (new file, 168 lines)
@@ -0,0 +1,168 @@
+import requests
+from bs4 import BeautifulSoup, Tag
+import json
+import re
+import uuid
+
+main_page_urls = [
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-11_2f_12",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-9_2f_10",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-7_2f_8",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-5_2f_6",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-3_2f_4",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-36/issue-1_2f_2",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-11_2f_12",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-9_2f_10",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-7_2f_8",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-5_2f_6",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-3_2f_4",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-35/issue-1_2f_2",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-11_2f_12",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-9_2f_10",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-7_2f_8",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-5_2f_6",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-3_2f_4",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-34/issue-1_2f_2",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-11_2f_12",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-9_2f_10",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-7_2f_8",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-5_2f_6",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-3_2f_4",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-33/issue-1_2f_2",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-11_2f_12",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-9_2f_10",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-7_2f_8",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-5_2f_6",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-3_2f_4",
+    "https://projecteuclid.org/journals/differential-and-integral-equations/volume-32/issue-1_2f_2"
+]
+
+all_d_list = []
+
+# Iterate over the links of the main pages
+for main_page_url in main_page_urls:
+    response = requests.get(main_page_url)
+    html = response.text
+    soup = BeautifulSoup(html, "html.parser")
+
+    pattern = re.compile(r'^/journals/differential-and-integral-equations/')
+    links = soup.find_all("a", href=pattern)
+    sub_urls = [link["href"] for link in links if link["href"].endswith(".full")]
+
+    # Visit the sub-links and extract their information
+    for sub_url in sub_urls:
+        full_sub_url = "https://projecteuclid.org" + sub_url
+        sub_response = requests.get(full_sub_url)
+        sub_html = sub_response.text
+
+        # Extract the information on the sub-page
+        sub_soup = BeautifulSoup(sub_html, "html.parser")
+
+        # Find the authors
+        author_tags = sub_soup.find_all('meta', {'name': 'citation_author'})
+
+        authors = {}  # Dictionary used to store the author information
+
+        # Process each author entry
+        for i, tag in enumerate(author_tags, 1):
+            citation_author = tag['content']
+            authors[i] = citation_author if citation_author else None
+
+        # Find the basic information of the article
+        titles = []
+        for title in sub_soup.find_all('meta', {'name': 'citation_title'}):
+            if title.get('content') is not None:
+                titles.append(title.get('content'))
+
+        # Find the publication date
+        publish_times = []
+        for publish_time in sub_soup.find_all('meta', {'name': 'publish_date'}):
+            if publish_time.get('content'):
+                publish_times.append(str(publish_time.get('content')))
+            else:
+                publish_times.append('None')
+
+        # Find the keywords
+        keywords_list = []
+        for keywords in sub_soup.find_all('meta', {'name': 'citation_keywords'}):
+            if keywords.get('content'):
+                keywords_list.append(keywords.get('content'))
+            else:
+                keywords_list.append('None')
+
+        # Find the DOI
+        dois = []
+        for doi in sub_soup.find_all('meta', {'name': 'citation_doi'}):
+            dois.append(str(doi.get('content')))
+        doi_separated = ";\n".join(dois)
+
+        # Find the volume
+        volumes = []
+        for volume in sub_soup.find_all('meta', {'name': 'citation_volume'}):
+            if volume.get('content'):
+                volumes.append(volume.get('content'))
+            else:
+                volumes.append('None')
+        volume_separated = ";\n".join(volumes)
+
+        # Find the issue
+        issues = []
+        for issue in sub_soup.find_all('meta', {'name': 'citation_issue'}):
+            issues.append(str(issue.get('content')))
+        issue_separated = ";\n".join(issues)
+
+        # Find the first page
+        firstpages = []
+        for firstpage in sub_soup.find_all('meta', {'name': 'citation_firstpage'}):
+            firstpages.append(str(firstpage.get('content')))
+
+        # Find the last page
+        lastpages = []
+        for lastpage in sub_soup.find_all('meta', {'name': 'citation_lastpage'}):
+            lastpages.append(str(lastpage.get('content')))
+
+        # Find the MSC codes
+        MSC = []
+        for msc in sub_soup.find_all('meta', {'name': 'dc.Subject'}):
+            MSC.append(str(msc.get('content')))
+        MSC_separated = ";\n".join(MSC)
+
+        all_d = {"article_id": str(uuid.uuid4()),
+                 "Author": authors,
+                 "correspond_author": "null",
+                 "Title": titles,
+                 "Publish Time": publish_times,
+                 "keywords": keywords_list,
+                 "DOI": doi_separated,
+                 "volume": volume_separated,
+                 "issue": issue_separated,
+                 "url": full_sub_url,
+                 "page": "-".join(firstpages) + "-" + "-".join(lastpages),
+                 "journal": "projecteuclid.org",
+                 "MSC": MSC_separated}
+        # print(all_d)
+
+        # Store the record in the list
+        all_d_list.append(all_d)
+
+# Write the collected records into a JSON file
+# print(all_d_list)
+with open('articles.json', 'w') as f:
+    json.dump(all_d_list, f, indent=2)
+
+print("The JSON file has been generated successfully.")
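The extraction above relies on the citation_* meta tags that Project Euclid pages expose for indexing. A minimal check of the author/title part on a synthetic snippet (the HTML is illustrative, not a real page):

from bs4 import BeautifulSoup

html = '''<meta name="citation_author" content="Jane Doe">
<meta name="citation_author" content="John Roe">
<meta name="citation_title" content="An Example Equation">'''
soup = BeautifulSoup(html, "html.parser")

authors = {i: tag['content']
           for i, tag in enumerate(soup.find_all('meta', {'name': 'citation_author'}), 1)}
titles = [t.get('content') for t in soup.find_all('meta', {'name': 'citation_title'})]
print(authors, titles)  # -> {1: 'Jane Doe', 2: 'John Roe'} ['An Example Equation']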
@@ -31,6 +31,8 @@ def Transf():
             with open(file_path, 'r', encoding='utf-8') as file:
                 data = json.load(file)
 
+            for Dict in data:
+                if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
                     # Select the articles
                     data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
                         Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]