Fix bugs and add new code to search for author data without email information

This commit is contained in:
Chenxiao Xia 2023-09-20 23:29:42 +08:00
parent 2a1fcfc4cd
commit 2f6f86a48e
2 changed files with 101 additions and 9 deletions

View File

@ -2,13 +2,12 @@ import json
import os
import re
import time
import unicodedata
import torch
from pprint import pprint
from concurrent.futures import ThreadPoolExecutor, as_completed, wait
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
'''
@ -43,8 +42,11 @@ from sklearn.metrics.pairwise import cosine_similarity
Suggestions
1. In the merge step, first normalize Latin script and other non-English characters to English before comparison (resolved)
2. For data with arbitrarily added punctuation and extra information, pre-process it locally before comparison;
this is complicated and there is no good approach yet
2. For data with arbitrarily added punctuation and extra information, pre-process it locally before comparison (resolved:
strip extra spaces and all punctuation before running the similarity comparison)
3. The similarity comparison needs a faster approach; comparing through the preloaded model takes too long and
CPU usage is also high
'''
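# Illustrative sketch only (not the repo's code) of the model-based comparison
# that suggestion 3 above calls slow: embed both strings with the pretrained
# BERT model imported at the top, then score them with cosine similarity.
# The model name and mean pooling are assumptions; loading the model once,
# outside the function, would be cheaper in practice.
def bert_similarity(text1, text2):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    def embed(text):
        inputs = tokenizer(text, return_tensors='pt', truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        # Mean-pool the last hidden state into one vector per string
        return outputs.last_hidden_state.mean(dim=1).numpy()
    return cosine_similarity(embed(text1), embed(text2))[0][0]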
@ -61,7 +63,7 @@ def SameWeb_merge(folder_path):
if len(a['from_article']) == 1 and len(aa['from_article']) == 1:
if a.get("first_name") == aa.get("first_name") and a.get("middle_name") == aa.get(
"middle_name") and a.get("last_name") == aa.get("last_name"):
if a['from_article'] == aa['from_article']:
if a['from_article'] == aa['from_article']: # Remove the duplicate record
Data.remove(Data[j])
count2[0] += 1
return SameName_merge(i, count1, count2, Data, ml)
@ -165,7 +167,7 @@ def SameWeb_merge(folder_path):
text1 = re.sub(r'[^\w\s]', '', str(text1)).lower()
text2 = re.sub(r'[^\w\s]', '', str(text2)).lower()
# Delete space
# Delete extra spaces
text1 = re.sub(r'\s+', ' ', text1).strip()
text2 = re.sub(r'\s+', ' ', text2).strip()
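# For example (illustrative values), two noisy variants normalize identically:
#   'Dept. of C.S.,  MIT '  -> strip punctuation, lower -> 'dept of cs  mit '
#                           -> collapse whitespace      -> 'dept of cs mit'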
@ -211,6 +213,7 @@ def SameWeb_merge(folder_path):
num1 = 0 # Unique data counter
num2 = 0 # Completely merged data counter
num3 = 0 # Incompletely merged data counter
num4 = 0 # Similarity algorithm merged data counter
# Add data into list
for filename in os.listdir(folder_path):
@ -262,9 +265,10 @@ def SameWeb_merge(folder_path):
# Combine Data with temp_list
for i in temp_list:
if len(i) == 1:
Data.append(i[0])
num4 += 1
Data.insert(0, i[0])
else:
Data.append(i)
Data.insert(-1, i)
print('\n----- Similarity algorithm merge complete -----\n')
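# Note on the insert calls above (explanatory, not repo code):
# Data.insert(0, i[0]) puts similarity-merged single records at the front,
# while Data.insert(-1, i) places a group *before* the current last element,
# not after it: [1, 2, 3].insert(-1, 9) yields [1, 2, 9, 3].
# A plain Data.append(i) would add it at the end instead.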
@ -287,6 +291,7 @@ def SameWeb_merge(folder_path):
print(str(num1) + ' copies of data are unique.')
print(str(num2) + ' copies of data are completely merged')
print(str(num4) + ' copies of data are completely merged by the similarity algorithm')
print(str(num3) + ' copies of data are incompletely merged')
# Save into file

View File

@ -0,0 +1,87 @@
import os
import json
'''
========== fileReader ==========
1. This program reads the records in the author-information JSON files that contain no email data.
2. It looks up each author record to get its author_id and from_article, then searches the article data for the
   title of the source article; author_id and title are stored together as a dictionary in the pending list ar_list.
3. ar_list is returned as the result.
'''
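# Example of one collected entry (shape assumed from temp_data below):
#   {'title': 'Some Article Title', 'author_id': 42}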
# Function to find the author data which does not have "email" information
def Read(author_path, article_path):
    # Read the author files and collect every record whose email is missing
    def au_read(path, file_names, result):
        for file_name in file_names:
            file_path = os.path.join(path, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
            # Iterate in reverse so records could also be deleted in place
            for i in range(len(data) - 1, -1, -1):
                # Match records whose email field is explicitly null;
                # a missing key falls back to 0 and is skipped
                if data[i].get('affiliation', [{}])[0].get('email', 0) is None:
                    result.append(data[i])
                    # del data[i]
        return result
    # Search one article file for the source article and record its title
    def ar_read(path, file_name, result, ar_id, au_id):
        file_path = os.path.join(path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        for record in data:
            if record.get('article_id') == ar_id:
                # A new dictionary to store the key information
                temp_data = {
                    'title': record.get('title'),
                    'author_id': au_id
                }
                result.append(temp_data)  # Add it to the result list
        return result
    # ========== Main code ==========
    au_list = []  # List for author data
    ar_list = []  # List for article data (batches)
    ar_temp = []  # Temporary buffer for the current batch
    num = 0       # Counter for stored records
    # List the data files
    au_names = os.listdir(author_path)
    ar_names = os.listdir(article_path)
    # Collect the author records that have no email information
    au_list = au_read(author_path, au_names, au_list)
    # Search for the articles the authors come from, flushing in batches of 100
    for au_data in au_list:
        if len(ar_temp) == 100:
            num += 100
            # Append a copy: clearing ar_temp in place would otherwise also
            # empty the batch just stored, since both names share one list
            ar_list.append(ar_temp.copy())
            ar_temp.clear()
            print(str(num) + " copies of data have been stored.")
        # Pick the article file by year range; note that os.listdir order is
        # not guaranteed, so the index-based file selection is fragile
        year = int(au_data.get('affiliation', [{}])[0].get('year', 0))
        if year <= 2009:
            ar_temp = ar_read(article_path, ar_names[3], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
        elif year <= 2010:
            ar_temp = ar_read(article_path, ar_names[0], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
        elif year <= 2020:
            ar_temp = ar_read(article_path, ar_names[1], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
        else:
            ar_temp = ar_read(article_path, ar_names[2], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
    if len(ar_temp) > 0:  # Store the remaining partial batch
        ar_list.append(ar_temp)
    print(len(ar_list))
    return ar_list
# ========== Test code ==========
# Read('./test_buffer/Author_output', './test_buffer/Article_output')
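A side note on the batching in Read above: appending ar_temp and then clearing it in place would empty the batch just stored, because both names refer to the same list object, which is why the code appends a copy. A standalone illustration (not from the repo):

batch = [1, 2]
batches = []
batches.append(batch)                # batches holds a reference, not a snapshot
batch.clear()                        # ...so this also empties batches[0]
print(batches)                       # [[]]
batches_fixed = []
batch2 = [1, 2]
batches_fixed.append(batch2.copy())  # store a copy instead
batch2.clear()
print(batches_fixed)                 # [[1, 2]]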