diff --git a/00_File_merge/Merge.py b/00_File_merge/Merge.py
new file mode 100644
index 0000000..082bdd2
--- /dev/null
+++ b/00_File_merge/Merge.py
@@ -0,0 +1,108 @@
+import json
+import os
+from pprint import pprint
+import unicodedata
+
+
+def Merge(folder_path):
+    Data = []  # Empty list
+
+    for filename in os.listdir(folder_path):
+        if filename.endswith('.json'):
+            file_path = os.path.join(folder_path, filename)
+            with open(file_path, 'r', encoding='utf-8') as file:
+                data = json.load(file)
+                Data.append(data)
+
+    count = 0  # counter
+
+    # Same file merge
+    for data in Data:
+        if len(data) > 0:
+            data = sorted(data, key=lambda x: x['affiliation'][0]['year'])
+            for a in data:
+                for aa in data:
+                    if a.get("firstname") == aa.get("firstname") and a.get("middlename") == aa.get("middlename") and \
+                            a.get("lastname") == aa.get("lastname"):
+
+                        # Add different key-elements of "affiliation" into the first element
+                        if a.get('affiliation') != aa.get('affiliation'):
+                            # Uniform text formatting
+                            ch_1 = unicodedata.normalize('NFKD', str(a.get('affiliation')[-1])).encode('ascii', 'ignore')
+                            ch_2 = unicodedata.normalize('NFKD', str(aa.get('affiliation')[0])).encode('ascii', 'ignore')
+                            if ch_1 != ch_2:
+                                hash_1 = hash(('affiliation', tuple(a.get('affiliation')[-1].values())))
+                                hash_2 = hash(('affiliation', tuple(aa.get('affiliation')[0].values())))
+                                if hash_1 != hash_2:
+                                    a['affiliation'] += aa['affiliation']
+
+                        # Add different key-elements of "from_article" into the first element
+                        a["from_article"] = [a.get("from_article"), aa.get("from_article")] if \
+                            isinstance(a.get("from_article"), str) else a.get("from_article") + (
+                            [aa.get("from_article")] if isinstance(aa.get("from_article"), str) else
+                            aa.get("from_article"))
+
+                        pprint(a)
+                        print('//////////////////////////////////////\n')
+
+                        # Delete extra elements
+                        data.remove(aa)
+
+                        # Update the counter
+                        count += 1
+
+    # Different files merge
+    A = Data[2]  # newest
+    B = Data[1]  # (2015-2020)
+    C = Data[0]  # (2010-2014)
+    D = Data[3]  # oldest
+
+    Data.clear()
+    Data = [B, C, D]
+
+    for data in Data:
+        if len(data) > 0:
+            for a in A:
+                for aa in data:
+                    if a.get("firstname") == aa.get("firstname") and a.get("middlename") == aa.get("middlename") and \
+                            a.get("lastname") == aa.get("lastname"):
+
+                        # Add different key-elements of "affiliation" into the first element
+                        if a.get('affiliation') != aa.get('affiliation'):
+                            # Uniform text formatting
+                            ch_1 = unicodedata.normalize('NFKD', str(a.get('affiliation')[-1])).encode('ascii', 'ignore')
+                            ch_2 = unicodedata.normalize('NFKD', str(aa.get('affiliation')[0])).encode('ascii', 'ignore')
+                            if ch_1 != ch_2:
+                                hash_1 = hash(('affiliation', tuple(a.get('affiliation')[-1].values())))
+                                hash_2 = hash(('affiliation', tuple(aa.get('affiliation')[0].values())))
+                                if hash_1 != hash_2:
+                                    a['affiliation'] += aa['affiliation']
+
+                        # Add different key-elements of "from_article" into the first element
+                        a["from_article"] = [a.get("from_article"), aa.get("from_article")] if \
+                            isinstance(a.get("from_article"), str) else a.get("from_article") + (
+                            [aa.get("from_article")] if isinstance(aa.get("from_article"), str) else
+                            aa.get("from_article"))
+
+                        pprint(a)
+                        print('================================\n')
+
+                        # Delete extra elements
+                        data.remove(aa)
+
+                        # Update the counter
+                        count += 1
+
+            # Combined in one list
+            A += data
+
+    # Tips
+    print(str(count) + ' file(s) have been merged.')
+    print('There are ' + str(len(A)) + ' file(s) in total, now.')
+
+
+# # input the file path here !!!
+# Merge('./EJQTDE_buffer/Author_output')
+
+
+
diff --git a/EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py
similarity index 97%
rename from EJDE_spider/ejde_main.py
rename to 01_EJDE_spider/ejde_main.py
index d7d6450..905f9eb 100644
--- a/EJDE_spider/ejde_main.py
+++ b/01_EJDE_spider/ejde_main.py
@@ -139,9 +139,9 @@ def process_article(url):
         author_data = {
             "author_id": str(uuid.uuid4()),
             "from_article": article_id,
-            "first_name": name[0],
-            "last_name": name[-1],
-            "middle_name": name[1:len(name) - 1] if len(name) > 2 else None,
+            "firstname": name[0],
+            "lastname": name[-1],
+            "middlename": name[1:len(name) - 1] if len(name) > 2 else None,
             "affiliation": [{
                 "year": volume,
                 "affiliation": affiliation,
diff --git a/01_EJDE_spider/ejde_merge.py b/01_EJDE_spider/ejde_merge.py
new file mode 100644
index 0000000..d3bcaf1
--- /dev/null
+++ b/01_EJDE_spider/ejde_merge.py
@@ -0,0 +1,105 @@
+import json
+import os
+from pprint import pprint
+import unicodedata
+
+
+def Merge(folder_path):
+    Data = []  # Empty list
+
+    for filename in os.listdir(folder_path):
+        if filename.endswith('.json'):
+            file_path = os.path.join(folder_path, filename)
+            with open(file_path, 'r', encoding='utf-8') as file:
+                data = json.load(file)
+                Data.append(data)
+
+    count = 0  # counter
+
+    # Same file merge
+    for data in Data:
+        if len(data) > 0:
+            data = sorted(data, key=lambda x: x['affiliation'][0]['year'])
+            for a in data:
+                for aa in data:
+                    if a.get("firstname") == aa.get("firstname") and a.get("middlename") == aa.get("middlename") and \
+                            a.get("lastname") == aa.get("lastname"):
+
+                        # Add different key-elements of "affiliation" into the first element
+                        if a.get('affiliation') != aa.get('affiliation'):
+                            # Uniform text formatting
+                            ch_1 = unicodedata.normalize('NFKD', str(a.get('affiliation')[-1])).encode('ascii', 'ignore')
+                            ch_2 = unicodedata.normalize('NFKD', str(aa.get('affiliation')[0])).encode('ascii', 'ignore')
+                            if ch_1 != ch_2:
+                                hash_1 = hash(('affiliation', tuple(a.get('affiliation')[-1].values())))
+                                hash_2 = hash(('affiliation', tuple(aa.get('affiliation')[0].values())))
+                                if hash_1 != hash_2:
+                                    a['affiliation'] += aa['affiliation']
+
+                        # Add different key-elements of "from_article" into the first element
+                        a["from_article"] = [a.get("from_article"), aa.get("from_article")] if \
+                            isinstance(a.get("from_article"), str) else a.get("from_article") + (
+                            [aa.get("from_article")] if isinstance(aa.get("from_article"), str) else
+                            aa.get("from_article"))
+
+                        pprint(a)
+                        print('//////////////////////////////////////\n')
+
+                        # Delete extra elements
+                        data.remove(aa)
+
+                        # Update the counter
+                        count += 1
+
+    # Different files merge
+    A = Data[2]  # newest
+    B = Data[1]  # (2015-2020)
+    C = Data[0]  # (2010-2014)
+    D = Data[3]  # oldest
+
+    Data.clear()
+    Data = [B, C, D]
+
+    for data in Data:
+        if len(data) > 0:
+            for a in A:
+                for aa in data:
+                    if a.get("firstname") == aa.get("firstname") and a.get("middlename") == aa.get("middlename") and \
+                            a.get("lastname") == aa.get("lastname"):
+
+                        # Add different key-elements of "affiliation" into the first element
+                        if a.get('affiliation') != aa.get('affiliation'):
+                            # Uniform text formatting
+                            ch_1 = unicodedata.normalize('NFKD', str(a.get('affiliation')[-1])).encode('ascii', 'ignore')
+                            ch_2 = unicodedata.normalize('NFKD', str(aa.get('affiliation')[0])).encode('ascii', 'ignore')
+                            if ch_1 != ch_2:
+                                hash_1 = hash(('affiliation', tuple(a.get('affiliation')[-1].values())))
+                                hash_2 = hash(('affiliation', tuple(aa.get('affiliation')[0].values())))
+                                if hash_1 != hash_2:
+                                    a['affiliation'] += aa['affiliation']
+
+                        # Add different key-elements of "from_article" into the first element
+                        a["from_article"] = [a.get("from_article"), aa.get("from_article")] if \
+                            isinstance(a.get("from_article"), str) else a.get("from_article") + (
+                            [aa.get("from_article")] if isinstance(aa.get("from_article"), str) else
+                            aa.get("from_article"))
+
+                        pprint(a)
+                        print('================================\n')
+
+                        # Delete extra elements
+                        data.remove(aa)
+
+                        # Update the counter
+                        count += 1
+
+            # Combined in one list
+            A += data
+
+    # Tips
+    print(str(count) + ' file(s) have been merged.')
+    print('There are ' + str(len(A)) + ' file(s) in total, now.')
+
+
+# input the file path
+Merge('./ejde_buffer/Author_output')
\ No newline at end of file
diff --git a/EJDE_spider/ejde_save.py b/01_EJDE_spider/ejde_save.py
similarity index 100%
rename from EJDE_spider/ejde_save.py
rename to 01_EJDE_spider/ejde_save.py
diff --git a/EJQTDE_spider/ejqtde_main.py b/02_EJQTDE_spider/ejqtde_main.py
similarity index 100%
rename from EJQTDE_spider/ejqtde_main.py
rename to 02_EJQTDE_spider/ejqtde_main.py
diff --git a/EJQTDE_spider/ejqtde_save.py b/02_EJQTDE_spider/ejqtde_save.py
similarity index 100%
rename from EJQTDE_spider/ejqtde_save.py
rename to 02_EJQTDE_spider/ejqtde_save.py
diff --git a/EJQTDE_spider/ejqtde_scrawler.py b/02_EJQTDE_spider/ejqtde_scrawler.py
similarity index 100%
rename from EJQTDE_spider/ejqtde_scrawler.py
rename to 02_EJQTDE_spider/ejqtde_scrawler.py
diff --git a/ProjectEuclid_spider/projecteuclid_main b/03_ProjectEuclid_spider/projecteuclid_main
similarity index 100%
rename from ProjectEuclid_spider/projecteuclid_main
rename to 03_ProjectEuclid_spider/projecteuclid_main
diff --git a/SpringerOpen_spider/SD_detail.py b/04_SpringerOpen_spider/SD_detail.py
similarity index 100%
rename from SpringerOpen_spider/SD_detail.py
rename to 04_SpringerOpen_spider/SD_detail.py
diff --git a/SpringerOpen_spider/SD_link.py b/04_SpringerOpen_spider/SD_link.py
similarity index 100%
rename from SpringerOpen_spider/SD_link.py
rename to 04_SpringerOpen_spider/SD_link.py
diff --git a/SpringerOpen_spider/SD_main.py b/04_SpringerOpen_spider/SD_main.py
similarity index 100%
rename from SpringerOpen_spider/SD_main.py
rename to 04_SpringerOpen_spider/SD_main.py
diff --git a/SpringerOpen_spider/SD_save.py b/04_SpringerOpen_spider/SD_save.py
similarity index 100%
rename from SpringerOpen_spider/SD_save.py
rename to 04_SpringerOpen_spider/SD_save.py
diff --git a/SpringerOpen_spider/SD_scrawl.py b/04_SpringerOpen_spider/SD_scrawl.py
similarity index 100%
rename from SpringerOpen_spider/SD_scrawl.py
rename to 04_SpringerOpen_spider/SD_scrawl.py
diff --git a/SpringerOpen_spider/SD_threads.py b/04_SpringerOpen_spider/SD_threads.py
similarity index 100%
rename from SpringerOpen_spider/SD_threads.py
rename to 04_SpringerOpen_spider/SD_threads.py
diff --git a/WorldScientific b/05_WorldScientific_spider/WorldScientific
similarity index 100%
rename from WorldScientific
rename to 05_WorldScientific_spider/WorldScientific