New code for merging data from the same web source (00_File_merge)

2023-08-06 19:42:43 +08:00 · 2023-08-06 19:42:43 +08:00 · 1e98615778
commit 1e98615778
parent e9bdb9cdff
15 changed files with 216 additions and 3 deletions
--- a/00_File_merge/Merge.py
+++ b/00_File_merge/Merge.py
@ -0,0 +1,108 @@
+import json
+import os
+from pprint import pprint
+import unicodedata
+
+
+def _same_person(a, b):
+    """Return True when both records carry the same first, middle and last name."""
+    return all(a.get(key) == b.get(key) for key in ("firstname", "middlename", "lastname"))
+
+
+def _as_list(value):
+    """Return *value* as a list: None -> [], a list -> copy, anything else -> [value]."""
+    if value is None:
+        return []
+    return list(value) if isinstance(value, list) else [value]
+
+
+def _absorb(a, aa):
+    """Fold the affiliations and article ids of *aa* into *a*; return True if *a* grew.
+
+    Nothing is copied when the two affiliation lists are equal, or when the last
+    affiliation of *a* matches the first affiliation of *aa* (compared both as
+    ASCII-folded text and by value).
+    """
+    if a.get('affiliation') == aa.get('affiliation'):
+        return False
+    last, first = a.get('affiliation')[-1], aa.get('affiliation')[0]
+    # Uniform text formatting: drop accents / compatibility forms before comparing.
+    ch_1 = unicodedata.normalize('NFKD', str(last)).encode('ascii', 'ignore')
+    ch_2 = unicodedata.normalize('NFKD', str(first)).encode('ascii', 'ignore')
+    # Comparing the values directly replaces the former hash() test: same
+    # outcome, but it no longer fails when an entry holds an unhashable value.
+    if ch_1 == ch_2 or list(last.values()) == list(first.values()):
+        return False
+    a['affiliation'] += aa['affiliation']
+    # Always build a flat list: the former expression nested a list inside a list
+    # when *a* held a string and *aa* a list, and failed on a missing value.
+    a["from_article"] = _as_list(a.get("from_article")) + _as_list(aa.get("from_article"))
+    return True
+
+
+def _fold(target, records, separator):
+    """Merge each record into its namesake in *target*, or append it when there is none.
+
+    *target* is modified in place, *records* is left untouched, and *separator* is
+    printed after every record that gained affiliations.  Returns the number of
+    records that were folded into an existing one.
+    """
+    merged = 0
+    for aa in records:
+        a = next((rec for rec in target if _same_person(rec, aa)), None)
+        if a is None:
+            target.append(aa)
+            continue
+        if _absorb(a, aa):
+            pprint(a)
+            print(separator)
+        merged += 1
+    return merged
+
+
+def Merge(folder_path):
+    """Merge duplicate author records from every ``*.json`` file in *folder_path*.
+
+    Each file must hold a list of author dicts.  Records sharing first, middle and
+    last name are combined, first inside each file (sorted by the year of their
+    first affiliation) and then across files, into the list loaded third.
+
+    Returns:
+        The combined list of author records (previously nothing was returned).
+    """
+    Data = []       # one list of author records per JSON file
+    # sorted(): os.listdir() order is arbitrary, so positional file roles varied by machine.
+    for filename in sorted(os.listdir(folder_path)):
+        if filename.endswith('.json'):
+            file_path = os.path.join(folder_path, filename)
+            with open(file_path, 'r', encoding='utf-8') as file:
+                Data.append(json.load(file))
+
+    count = 0       # counter of records folded into another record
+
+    # Same file merge. A fresh list replaces the old loop, which removed items while
+    # iterating, deleted records matched against themselves, and discarded its result.
+    for index, data in enumerate(Data):
+        if len(data) > 0:
+            by_year = sorted(data, key=lambda x: x['affiliation'][0]['year'])
+            Data[index] = []
+            count += _fold(Data[index], by_year, '//////////////////////////////////////\n')
+
+    # Different files merge: index 2 ("newest") is the base, then 1, 0, 3; absent
+    # indexes are skipped, extra files follow. NOTE(review): confirm file-name order.
+    order = [i for i in (2, 1, 0, 3) if i < len(Data)] + list(range(4, len(Data)))
+    A = Data[order[0]] if order else []
+    for index in order[1:]:
+        count += _fold(A, Data[index], '================================\n')
+
+    # Tips
+    print(str(count) + ' file(s) have been merged.')
+    print('There are ' + str(len(A)) + ' file(s) in total, now.')
+    return A
+
+
+# # input the file path here !!!
+# Merge('./EJQTDE_buffer/Author_output')
+
+
+
--- a/01_EJDE_spider/ejde_main.py
+++ b/01_EJDE_spider/ejde_main.py
@ -139,9 +139,9 @@ def process_article(url):
                author_data = {
                    "author_id": str(uuid.uuid4()),
                    "from_article": article_id,
-                    "first_name": name[0],
-                    "last_name": name[-1],
-                    "middle_name": name[1:len(name) - 1] if len(name) > 2 else None,
+                    "firstname": name[0],
+                    "lastname": name[-1],
+                    "middlename": name[1:len(name) - 1] if len(name) > 2 else None,
                    "affiliation": [{
                        "year": volume,
                        "affiliation": affiliation,
--- a/01_EJDE_spider/ejde_merge.py
+++ b/01_EJDE_spider/ejde_merge.py
@ -0,0 +1,105 @@
+import json
+import os
+from pprint import pprint
+import unicodedata
+
+
+def _same_person(a, b):
+    """Return True when both records carry the same first, middle and last name."""
+    return all(a.get(key) == b.get(key) for key in ("firstname", "middlename", "lastname"))
+
+
+def _as_list(value):
+    """Return *value* as a list: None -> [], a list -> copy, anything else -> [value]."""
+    if value is None:
+        return []
+    return list(value) if isinstance(value, list) else [value]
+
+
+def _absorb(a, aa):
+    """Fold the affiliations and article ids of *aa* into *a*; return True if *a* grew.
+
+    Nothing is copied when the two affiliation lists are equal, or when the last
+    affiliation of *a* matches the first affiliation of *aa* (compared both as
+    ASCII-folded text and by value).
+    """
+    if a.get('affiliation') == aa.get('affiliation'):
+        return False
+    last, first = a.get('affiliation')[-1], aa.get('affiliation')[0]
+    # Uniform text formatting: drop accents / compatibility forms before comparing.
+    ch_1 = unicodedata.normalize('NFKD', str(last)).encode('ascii', 'ignore')
+    ch_2 = unicodedata.normalize('NFKD', str(first)).encode('ascii', 'ignore')
+    # Comparing the values directly replaces the former hash() test: same
+    # outcome, but it no longer fails when an entry holds an unhashable value.
+    if ch_1 == ch_2 or list(last.values()) == list(first.values()):
+        return False
+    a['affiliation'] += aa['affiliation']
+    # Always build a flat list: the former expression nested a list inside a list
+    # when *a* held a string and *aa* a list, and failed on a missing value.
+    a["from_article"] = _as_list(a.get("from_article")) + _as_list(aa.get("from_article"))
+    return True
+
+
+def _fold(target, records, separator):
+    """Merge each record into its namesake in *target*, or append it when there is none.
+
+    *target* is modified in place, *records* is left untouched, and *separator* is
+    printed after every record that gained affiliations.  Returns the number of
+    records that were folded into an existing one.
+    """
+    merged = 0
+    for aa in records:
+        a = next((rec for rec in target if _same_person(rec, aa)), None)
+        if a is None:
+            target.append(aa)
+            continue
+        if _absorb(a, aa):
+            pprint(a)
+            print(separator)
+        merged += 1
+    return merged
+
+
+def Merge(folder_path):
+    """Merge duplicate author records from every ``*.json`` file in *folder_path*.
+
+    Each file must hold a list of author dicts.  Records sharing first, middle and
+    last name are combined, first inside each file (sorted by the year of their
+    first affiliation) and then across files, into the list loaded third.
+
+    Returns:
+        The combined list of author records (previously nothing was returned).
+    """
+    Data = []       # one list of author records per JSON file
+    # sorted(): os.listdir() order is arbitrary, so positional file roles varied by machine.
+    for filename in sorted(os.listdir(folder_path)):
+        if filename.endswith('.json'):
+            file_path = os.path.join(folder_path, filename)
+            with open(file_path, 'r', encoding='utf-8') as file:
+                Data.append(json.load(file))
+
+    count = 0       # counter of records folded into another record
+
+    # Same file merge. A fresh list replaces the old loop, which removed items while
+    # iterating, deleted records matched against themselves, and discarded its result.
+    for index, data in enumerate(Data):
+        if len(data) > 0:
+            by_year = sorted(data, key=lambda x: x['affiliation'][0]['year'])
+            Data[index] = []
+            count += _fold(Data[index], by_year, '//////////////////////////////////////\n')
+
+    # Different files merge: index 2 ("newest") is the base, then 1, 0, 3; absent
+    # indexes are skipped, extra files follow. NOTE(review): confirm file-name order.
+    order = [i for i in (2, 1, 0, 3) if i < len(Data)] + list(range(4, len(Data)))
+    A = Data[order[0]] if order else []
+    for index in order[1:]:
+        count += _fold(A, Data[index], '================================\n')
+
+    # Tips
+    print(str(count) + ' file(s) have been merged.')
+    print('There are ' + str(len(A)) + ' file(s) in total, now.')
+    return A
+
+
+# Script entry point: merge the author JSON files found in the folder given below.
+Merge('./ejde_buffer/Author_output')
--- a/01_EJDE_spider/ejde_save.py
+++ b/01_EJDE_spider/ejde_save.py
--- a/02_EJQTDE_spider/ejqtde_main.py
+++ b/02_EJQTDE_spider/ejqtde_main.py
--- a/02_EJQTDE_spider/ejqtde_save.py
+++ b/02_EJQTDE_spider/ejqtde_save.py
--- a/02_EJQTDE_spider/ejqtde_scrawler.py
+++ b/02_EJQTDE_spider/ejqtde_scrawler.py
--- a/03_ProjectEuclid_spider/projecteuclid_main
+++ b/03_ProjectEuclid_spider/projecteuclid_main
--- a/04_SpringerOpen_spider/SD_detail.py
+++ b/04_SpringerOpen_spider/SD_detail.py
--- a/04_SpringerOpen_spider/SD_link.py
+++ b/04_SpringerOpen_spider/SD_link.py
--- a/04_SpringerOpen_spider/SD_main.py
+++ b/04_SpringerOpen_spider/SD_main.py
--- a/04_SpringerOpen_spider/SD_save.py
+++ b/04_SpringerOpen_spider/SD_save.py
--- a/04_SpringerOpen_spider/SD_scrawl.py
+++ b/04_SpringerOpen_spider/SD_scrawl.py
--- a/04_SpringerOpen_spider/SD_threads.py
+++ b/04_SpringerOpen_spider/SD_threads.py
--- a/05_WorldScientific_spider/WorldScientific
+++ b/05_WorldScientific_spider/WorldScientific