New code for merging data from the same website (00_File_merge)
parent e9bdb9cdff
commit 1e98615778
00_File_merge/Merge.py (new file, 108 lines added)
@@ -0,0 +1,108 @@
import json
import os
from pprint import pprint
import unicodedata


def Merge(folder_path):
    Data = []  # Empty list

    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
                Data.append(data)

    count = 0  # counter

    # Same file merge
    for data in Data:
        if len(data) > 0:
            data = sorted(data, key=lambda x: x['affiliation'][0]['year'])
            for a in data:
                for aa in data:
                    if a.get("firstname") == aa.get("firstname") and a.get("middlename") == aa.get("middlename") and \
                            a.get("lastname") == aa.get("lastname"):

                        # Add different key-elements of "affiliation" into the first element
                        if a.get('affiliation') != aa.get('affiliation'):
                            # Uniform text formatting
                            ch_1 = unicodedata.normalize('NFKD', str(a.get('affiliation')[-1])).encode('ascii', 'ignore')
                            ch_2 = unicodedata.normalize('NFKD', str(aa.get('affiliation')[0])).encode('ascii', 'ignore')
                            if ch_1 != ch_2:
                                hash_1 = hash(('affiliation', tuple(a.get('affiliation')[-1].values())))
                                hash_2 = hash(('affiliation', tuple(aa.get('affiliation')[0].values())))
                                if hash_1 != hash_2:
                                    a['affiliation'] += aa['affiliation']

                            # Add different key-elements of "from_article" into the first element
                            a["from_article"] = [a.get("from_article"), aa.get("from_article")] if \
                                isinstance(a.get("from_article"), str) else a.get("from_article") + (
                                [aa.get("from_article")] if isinstance(aa.get("from_article"), str) else
                                aa.get("from_article"))

                            pprint(a)
                            print('//////////////////////////////////////\n')

                            # Delete extra elements
                            data.remove(aa)

                            # Update the counter
                            count += 1

    # Different files merge
    A = Data[2]  # newest
    B = Data[1]  # (2015-2020)
    C = Data[0]  # (2010-2014)
    D = Data[3]  # oldest

    Data.clear()
    Data = [B, C, D]

    for data in Data:
        if len(data) > 0:
            for a in A:
                for aa in data:
                    if a.get("firstname") == aa.get("firstname") and a.get("middlename") == aa.get("middlename") and \
                            a.get("lastname") == aa.get("lastname"):

                        # Add different key-elements of "affiliation" into the first element
                        if a.get('affiliation') != aa.get('affiliation'):
                            # Uniform text formatting
                            ch_1 = unicodedata.normalize('NFKD', str(a.get('affiliation')[-1])).encode('ascii', 'ignore')
                            ch_2 = unicodedata.normalize('NFKD', str(aa.get('affiliation')[0])).encode('ascii', 'ignore')
                            if ch_1 != ch_2:
                                hash_1 = hash(('affiliation', tuple(a.get('affiliation')[-1].values())))
                                hash_2 = hash(('affiliation', tuple(aa.get('affiliation')[0].values())))
                                if hash_1 != hash_2:
                                    a['affiliation'] += aa['affiliation']

                            # Add different key-elements of "from_article" into the first element
                            a["from_article"] = [a.get("from_article"), aa.get("from_article")] if \
                                isinstance(a.get("from_article"), str) else a.get("from_article") + (
                                [aa.get("from_article")] if isinstance(aa.get("from_article"), str) else
                                aa.get("from_article"))

                            pprint(a)
                            print('================================\n')

                            # Delete extra elements
                            data.remove(aa)

                            # Update the counter
                            count += 1

            # Combined in one list
            A += data

    # Tips
    print(str(count) + ' file(s) have been merged.')
    print('There are ' + str(len(A)) + ' file(s) in total, now.')


# # input the file path here !!!
# Merge('./EJQTDE_buffer/Author_output')
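For reference, Merge expects each JSON file in the folder to hold a list of author records with the keys used above (firstname, middlename, lastname, affiliation, from_article). A minimal sketch of one record and of a call follows; the values and the variable name example_author are invented for illustration, not taken from the repository's data.

# Illustrative only; the values below are made up.
example_author = {
    "author_id": "<uuid4 string>",          # the spider builds this via str(uuid.uuid4())
    "from_article": "<article id>",         # a str at first; becomes a list of ids after merging
    "firstname": "Ada",
    "middlename": None,
    "lastname": "Lovelace",
    "affiliation": [{"year": "2020", "affiliation": "Example University"}],
}

# Merge('./EJQTDE_buffer/Author_output')    # same call as the commented-out line above

Note that the "Different files merge" step indexes Data[0] through Data[3], so the folder is assumed to contain exactly four JSON files.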
@@ -139,9 +139,9 @@ def process_article(url):
         author_data = {
             "author_id": str(uuid.uuid4()),
             "from_article": article_id,
-            "first_name": name[0],
+            "firstname": name[0],
-            "last_name": name[-1],
+            "lastname": name[-1],
-            "middle_name": name[1:len(name) - 1] if len(name) > 2 else None,
+            "middlename": name[1:len(name) - 1] if len(name) > 2 else None,
             "affiliation": [{
                 "year": volume,
                 "affiliation": affiliation,
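In the hunk above, the spider's author_data keys change from first_name, last_name and middle_name to firstname, lastname and middlename, which matches the keys the new merge scripts look up with a.get("firstname"), a.get("middlename") and a.get("lastname").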
01_EJDE_spider/ejde_merge.py (new file, 105 lines added)
@@ -0,0 +1,105 @@
import json
import os
from pprint import pprint
import unicodedata


def Merge(folder_path):
    Data = []  # Empty list

    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
                Data.append(data)

    count = 0  # counter

    # Same file merge
    for data in Data:
        if len(data) > 0:
            data = sorted(data, key=lambda x: x['affiliation'][0]['year'])
            for a in data:
                for aa in data:
                    if a.get("firstname") == aa.get("firstname") and a.get("middlename") == aa.get("middlename") and \
                            a.get("lastname") == aa.get("lastname"):

                        # Add different key-elements of "affiliation" into the first element
                        if a.get('affiliation') != aa.get('affiliation'):
                            # Uniform text formatting
                            ch_1 = unicodedata.normalize('NFKD', str(a.get('affiliation')[-1])).encode('ascii', 'ignore')
                            ch_2 = unicodedata.normalize('NFKD', str(aa.get('affiliation')[0])).encode('ascii', 'ignore')
                            if ch_1 != ch_2:
                                hash_1 = hash(('affiliation', tuple(a.get('affiliation')[-1].values())))
                                hash_2 = hash(('affiliation', tuple(aa.get('affiliation')[0].values())))
                                if hash_1 != hash_2:
                                    a['affiliation'] += aa['affiliation']

                            # Add different key-elements of "from_article" into the first element
                            a["from_article"] = [a.get("from_article"), aa.get("from_article")] if \
                                isinstance(a.get("from_article"), str) else a.get("from_article") + (
                                [aa.get("from_article")] if isinstance(aa.get("from_article"), str) else
                                aa.get("from_article"))

                            pprint(a)
                            print('//////////////////////////////////////\n')

                            # Delete extra elements
                            data.remove(aa)

                            # Update the counter
                            count += 1

    # Different files merge
    A = Data[2]  # newest
    B = Data[1]  # (2015-2020)
    C = Data[0]  # (2010-2014)
    D = Data[3]  # oldest

    Data.clear()
    Data = [B, C, D]

    for data in Data:
        if len(data) > 0:
            for a in A:
                for aa in data:
                    if a.get("firstname") == aa.get("firstname") and a.get("middlename") == aa.get("middlename") and \
                            a.get("lastname") == aa.get("lastname"):

                        # Add different key-elements of "affiliation" into the first element
                        if a.get('affiliation') != aa.get('affiliation'):
                            # Uniform text formatting
                            ch_1 = unicodedata.normalize('NFKD', str(a.get('affiliation')[-1])).encode('ascii', 'ignore')
                            ch_2 = unicodedata.normalize('NFKD', str(aa.get('affiliation')[0])).encode('ascii', 'ignore')
                            if ch_1 != ch_2:
                                hash_1 = hash(('affiliation', tuple(a.get('affiliation')[-1].values())))
                                hash_2 = hash(('affiliation', tuple(aa.get('affiliation')[0].values())))
                                if hash_1 != hash_2:
                                    a['affiliation'] += aa['affiliation']

                            # Add different key-elements of "from_article" into the first element
                            a["from_article"] = [a.get("from_article"), aa.get("from_article")] if \
                                isinstance(a.get("from_article"), str) else a.get("from_article") + (
                                [aa.get("from_article")] if isinstance(aa.get("from_article"), str) else
                                aa.get("from_article"))

                            pprint(a)
                            print('================================\n')

                            # Delete extra elements
                            data.remove(aa)

                            # Update the counter
                            count += 1

            # Combined in one list
            A += data

    # Tips
    print(str(count) + ' file(s) have been merged.')
    print('There are ' + str(len(A)) + ' file(s) in total, now.')


# input the file path
Merge('./ejde_buffer/Author_output')
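As a sketch of what the merge produces for a duplicate author (values invented for illustration, not from the repository's data): when two records agree on firstname, middlename and lastname but differ in affiliation, the second record's affiliation entries are appended to the first, the from_article values are collected into a list, and the duplicate record is removed.

# Illustrative duplicate pair; rec_1 is the earlier record after sorting by affiliation year.
rec_1 = {"firstname": "Ada", "middlename": None, "lastname": "Lovelace",
         "from_article": "a-01",
         "affiliation": [{"year": "2015", "affiliation": "Univ. X"}]}
rec_2 = {"firstname": "Ada", "middlename": None, "lastname": "Lovelace",
         "from_article": "a-07",
         "affiliation": [{"year": "2019", "affiliation": "Univ. Y"}]}

# After the merge, rec_2 is dropped and rec_1 carries the combined data:
# {"firstname": "Ada", "middlename": None, "lastname": "Lovelace",
#  "from_article": ["a-01", "a-07"],
#  "affiliation": [{"year": "2015", "affiliation": "Univ. X"},
#                  {"year": "2019", "affiliation": "Univ. Y"}]}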