Updated ejde parser format
Fixed duplicate data dumping problem. Pushed new "ejde_buffer.zip".
This commit is contained in:
parent 50e30e105b
commit ad63bcf6c4
Binary file not shown.
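Note: judging from the hunks below, the "duplicate data dumping" problem was the unsynchronized save-and-clear of the shared article/author buffers across worker threads, which could flush the same batch twice. This commit routes every flush through a lock (save_data_thread_safe). A minimal standalone sketch of that pattern, with illustrative names only:

import json
import threading
import uuid

records = []                      # shared buffer filled by several worker threads
records_lock = threading.Lock()

def flush_thread_safe(buffer, lock, prefix):
    # Holding the lock across both the write and the clear prevents two
    # threads from dumping the same batch twice (the duplicate-dump bug).
    with lock:
        with open(f"{prefix}_{uuid.uuid4()}.json", "w", encoding="utf-8") as fh:
            json.dump(buffer, fh, indent=4)
        buffer.clear()

# Example use from a worker: records.append({...}); flush_thread_safe(records, records_lock, "Article_TS")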
@@ -26,11 +26,12 @@ payload = {
 jwt_token = jwt.encode(payload, secret_key, algorithm="HS256", headers=head)
 
 # Aminer API
-api_get_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish"
+api_paper_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish"
-api_get_citation = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/detail/list"
+api_paper_detail = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/detail/list"
+api_author_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v2/person/search"
 
 
-def aminer_get_id(title):
+def aminer_get_paper_id(title):
     headers = {
         "Authorization": f"Bearer {jwt_token}"
     }
@@ -39,7 +40,7 @@ def aminer_get_id(title):
         "size": "",
         "title": re.sub(r'[^a-zA-Z0-9\s]+', ' ', title).strip()
     }
-    response = requests.get(api_get_id, headers=headers, params=params)
+    response = requests.get(api_paper_id, headers=headers, params=params)
 
     if response.status_code == 200:
         data = response.json()
@@ -49,7 +50,7 @@ def aminer_get_id(title):
         not_on_aminer.append(title)
 
 
-def aminer_post_citation(aminer_id):
+def aminer_post_paper_citation(aminer_id):
     headers = {
         "Content-Type": "application/json;charset=utf-8",
         "Authorization": f"Bearer {jwt_token}"
@@ -57,7 +58,7 @@ def aminer_post_citation(aminer_id):
     request_data = {
         "ids": aminer_id
     }
-    response = requests.post(api_get_citation, headers=headers, data=json.dumps(request_data))
+    response = requests.post(api_paper_detail, headers=headers, data=json.dumps(request_data))
 
     if response.status_code == 200:
         data = response.json()
@@ -72,6 +73,31 @@ def aminer_post_citation(aminer_id):
         aminer_paper_citation_retry.append(aminer_id)
 
 
+def aminer_author_info(author_aminer_id, author_name, offset):
+    headers = {
+        "Content-Type": "application/json;charset=utf-8",
+        "Authorization": f"Bearer {jwt_token}"
+    }
+    request_data = {
+        "ids": author_aminer_id,
+        "query": author_name,
+        "offset": offset
+    }
+    response = requests.post(api_paper_detail, headers=headers, data=json.dumps(request_data))
+
+    if response.status_code == 200:
+        data = response.json()
+        if data.get("success"):
+            for item in data.get('data', []):
+                if 'n_citation' in item:
+                    n_citation = item['n_citation']
+                else:
+                    n_citation = 0
+                aminer_paper_citation.append(n_citation)
+        else:
+            aminer_paper_citation_retry.append(author_aminer_id)
+
+
 def scholarly_get_citation(title):
     # # Set up a ProxyGenerator object to use free proxies. This needs to be done only once per session
     pg = ProxyGenerator()
@@ -92,8 +118,7 @@ aminer_paper_citation = []
 aminer_paper_citation_retry = []
 
 # scholarly_get_citation("Traveling waves for unbalanced bistable equations with density dependent diffusion")
-aminer_get_id("Heat kernel estimates for fourth-order non-uniformly elliptic operators with non-strongly convex symbols")
-
+aminer_get_paper_id("Heat kernel estimates for fourth-order non-uniformly elliptic operators with non-strongly convex symbols")
 if aminer_paper_id:
-    aminer_post_citation(aminer_paper_id)
+    aminer_post_paper_citation(aminer_paper_id)
 print(aminer_paper_citation)

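For orientation, the request pattern used in this file is a JWT signed with PyJWT and sent as a bearer token: a GET with query parameters for the paper-list lookup, then a POST with a JSON body for the detail/citation lookup. A minimal standalone sketch follows; the secret, payload claims, title, and paper id are placeholders, not real Aminer values.

import json
import jwt  # PyJWT
import requests

secret_key = "YOUR_AMINER_SECRET"          # placeholder
payload = {"aud": "open_platform"}         # placeholder claims
head = {"alg": "HS256", "typ": "JWT"}

jwt_token = jwt.encode(payload, secret_key, algorithm="HS256", headers=head)
headers = {"Authorization": f"Bearer {jwt_token}"}

# Look up a paper id by title (GET with query parameters) ...
listing = requests.get(
    "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish",
    headers=headers,
    params={"size": "", "title": "Some paper title"},
)

# ... then fetch details/citation counts for the returned ids (POST with a JSON body).
details = requests.post(
    "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/detail/list",
    headers={**headers, "Content-Type": "application/json;charset=utf-8"},
    data=json.dumps({"ids": ["PLACEHOLDER_PAPER_ID"]}),
)
print(listing.status_code, details.status_code)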
@@ -1,7 +1,8 @@
+import re
 import time
 import uuid
 import requests
-import re
+import threading
 import ejde_save
 
 from retrying import retry
@@ -13,8 +14,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 '''
 Scraped site: 'ejde.math.txstate.edu'
 
-Total number of papers: 2023/08/08 - 4761
+Total number of papers: 2023/08/08 - 4785
-Total Time via VPN w/100ms-delay: 306.73s
+Total Time via VPN w/100ms-delay: 48.04s
 
 ========== Run order ==========
 1. ejde_main: fetch each year's journal links -> scrape each paper's info and author info -> call ejde_save -> store temporarily in small files (json)
@@ -23,6 +24,12 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 '''
 
 
+def save_data_thread_safe(data, data_lock, data_type):
+    with data_lock:
+        ejde_save.save_data(data, f"{data_type}", str(uuid.uuid4()) + ".json")
+        data.clear()
+
+
 def datetime_transform(date):
     month_typo = {
         "Janaury": "January",
@@ -120,7 +127,7 @@ def process_html_article(baseweb, article):
     # Get article title & url
     try:
         title = article.text.strip()
-        title = re.sub(r'\s+', ' ', title).strip()
+        title = str(re.sub(r'\s+', ' ', title).strip())
         article_url = baseweb + article.find_next("a")["href"]
         if "../../index.html" in article_url:
            print("Redundant URL:", article_url)
@@ -162,11 +169,11 @@ def process_article(title, article_url):
     # Extract title if title == None
     if not title:
         title_match = re.search(r"<h3>(.*?)<p>", article_text)
-        title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else None
+        title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else ""
 
     # Extract issue
     issue_match = re.search(r'No\. (\d+)', article_text)
-    issue = issue_match.group(1) if issue_match else None
+    issue = issue_match.group(1) if issue_match else ""
 
     # Extract volume
     volume_match = re.search(r'Vol\. (\d+)', article_text)
@@ -189,21 +196,23 @@ def process_article(title, article_url):
         volume = str(volume)
         issue = "Conference " + str(issue_number)
     else:
-        volume = None
+        volume = ""
 
     # Extract pp
     pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
-    pp = pp_match.group(1) if pp_match else None
+    pp = pp_match.group(1) if pp_match else ""
 
     # Extract submission date
     match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
-    submitted_date = match.group(1) if match else None
+    if not match:
+        match = re.search(r"Submitted\s+([A-Za-z]+\s+\d{1,2},\s+\d{4})", html)
+    submitted_date = match.group(1) if match else ""
     if submitted_date:
         submitted_date = datetime_transform(submitted_date)
 
     # Extract publication date
     match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
-    publish_date = match.group(1) if match else None
+    publish_date = match.group(1) if match else ""
     if publish_date:
         publish_date = datetime_transform(publish_date)
 
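A small illustration of why the fallback pattern for the submission date is added: the stricter pattern requires a single literal space after "Submitted", which fails when the page breaks the line there. The sample strings below are made up:

import re

samples = [
    "Submitted March 5, 2023. Published July 1, 2023.",
    "Submitted\n   March 5,\n2023. Published July 1, 2023.",
]

for html in samples:
    match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
    if not match:
        # Fallback: allow any whitespace after "Submitted" and around the comma.
        match = re.search(r"Submitted\s+([A-Za-z]+\s+\d{1,2},\s+\d{4})", html)
    print(match.group(1) if match else "")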
@@ -234,25 +243,25 @@ def process_article(title, article_url):
     doi_match = re.search(r'DOI: ([^\t\n<]+)', html)
     if not doi_match:
         doi_match = re.search(r'DOI: (.+)', html)
-    doi = doi_match.group(1) if doi_match else None
+    doi = doi_match.group(1) if doi_match else ""
-    doi = doi.replace('https://doi.org/', '') # strip doi website header
 
     # Article_id
     article_id = str(uuid.uuid4())
 
     # Author info
     authors = []
+    author_names = []
     table = article_soup.find('table')
     if table:
         for row in table.find_all('tr'):
             cells = [cell.text.strip() for cell in row.find_all('td')]
             for cell in cells:
-                if "email:" in cell:
+                if "email" in cell:
-                    cell = cell.split("email:")
+                    cell = cell.split("email")
                     email_list = str(cell[1]).split(',')
                     cell = cell[0]
-                elif "e-mail:" in cell:
+                elif "e-mail" in cell:
-                    cell = cell.split("e-mail:")
+                    cell = cell.split("e-mail")
                     email_list = str(cell[1]).split(',')
                     cell = cell[0]
                 else:
@@ -264,8 +273,11 @@ def process_article(title, article_url):
 
                 # Data processing
                 if cell[0]:
-                    authors.append(unidecode(cell[0]))
-                    name = re.split(r'[ .]', cell[0])
+                    author_id = str(uuid.uuid4())
+                    authors.append(author_id)
+                    author_names.append(unidecode(cell[0]))
+                    name = re.split(r'\s+', cell[0])
+                    name = [item for item in name if item != '']
                     affiliation = ', '.join(cell[1:]).lstrip(",").rstrip(",").strip()
                     affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
                     affiliation = affiliation.lstrip(",").rstrip(",").strip()
@@ -276,16 +288,19 @@ def process_article(title, article_url):
                     emails.append(unidecode(email_match.group())) if email_match else None
 
                     author_data = {
-                        "author_id": str(uuid.uuid4()),
+                        "author_id": author_id,
-                        "from_article": [article_id],
+                        "from_article": article_id,
                         "first_name": unidecode(name[0]),
                         "last_name": unidecode(name[-1]),
-                        "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None,
+                        "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else "",
-                        "affiliation": [{
-                            "year": volume,
-                            "affiliation": unidecode(affiliation),
-                            "email": emails
-                        }]
+                        "raw_name": unidecode(cell[0]),
+                        "affiliation": [
+                            {
+                                "year": volume,
+                                "affiliation": unidecode(affiliation),
+                                "email": ", ".join(emails)
+                            }
+                        ]
                     }
                     authorData.append(author_data)
                     authorNum += 1
@@ -308,12 +323,12 @@ def process_article(title, article_url):
         matches = matches.replace('["', '').replace('"]', '').replace('[\'', '').replace('\']', '')
         matches = matches.split("<p>")
         for match in matches:
-            if "email:" in match:
+            if "email" in match:
-                match = match.split("email:")
+                match = match.split("email")
                 email_list = str(match[1]).split(',')
                 match = match[0]
-            elif "e-mail:" in match:
+            elif "e-mail" in match:
-                match = match.split("e-mail:")
+                match = match.split("e-mail")
                 email_list = str(match[1]).split(',')
                 match = match[0]
             else:
@@ -330,8 +345,11 @@ def process_article(title, article_url):
 
             # Data processing
             if match[0]:
+                author_id = str(uuid.uuid4())
+                authors.append(author_id)
                 authors.append(unidecode(match[0]))
-                name = re.split(r'[ .]', match[0])
+                name = re.split(r'\s+', match[0])
+                name = [item for item in name if item != '']
                 affiliation = ''.join(match[1:]).lstrip(",").rstrip(",").strip()
                 affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
                 affiliation = affiliation.lstrip(",").rstrip(",").strip()
@@ -342,16 +360,19 @@ def process_article(title, article_url):
                 emails.append(unidecode(email_match.group())) if email_match else None
 
                 author_data = {
-                    "author_id": str(uuid.uuid4()),
+                    "author_id": author_id,
-                    "from_article": [article_id],
+                    "from_article": article_id,
                     "first_name": unidecode(name[0]),
                     "last_name": unidecode(name[-1]),
-                    "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None,
+                    "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else "",
-                    "affiliation": [{
-                        "year": volume,
-                        "affiliation": unidecode(affiliation),
-                        "email": emails
-                    }]
+                    "raw_name": unidecode(match[0]),
+                    "affiliation": [
+                        {
+                            "year": volume,
+                            "affiliation": unidecode(affiliation),
+                            "email": ", ".join(emails)
+                        }
+                    ]
                 }
                 authorData.append(author_data)
                 authorNum += 1
@@ -368,7 +389,7 @@ def process_article(title, article_url):
         "article_id": article_id,
         "title": unidecode(title),
         "authors": authors,
-        "corresponding_authors": None,
+        "author_names": author_names,
         "submit_datetime": submitted_date,
         "publish_datetime": publish_date,
         "keywords": keywords,
@@ -386,12 +407,10 @@ def process_article(title, article_url):
 
     # Save the data periodically based on batch size
     if len(articleData) % batch_size == 0:
-        ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
-        articleData.clear()
+        save_data_thread_safe(articleData, articleDataLock, "Article_TS")
 
     if len(authorData) % batch_size == 0:
-        ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
-        authorData.clear()
+        save_data_thread_safe(authorData, authorDataLock, "Author_TS")
 
 
 start_time = time.time()
@@ -429,6 +448,8 @@ authorNum = 0
 articleNum = 0
 
 batch_size = 100 # Number of articles to process before saving
+authorDataLock = threading.Lock()
+articleDataLock = threading.Lock()
 executor = ThreadPoolExecutor(max_workers=int(len(url_list) / 2)) # Set the number of worker threads
 
 # Process each URL using multithreading
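For orientation, the locks added above sit alongside the existing ThreadPoolExecutor, which drives the scraping in a loop of roughly this shape; process_url and url_list below are illustrative stand-ins, since only fragments of that section appear in these hunks:

from concurrent.futures import ThreadPoolExecutor, as_completed

def process_url(url):
    # Stand-in for the real per-year scraping function (assumption); the
    # actual script parses the index page and calls process_html_article.
    print("would fetch:", url)

url_list = ["https://ejde.math.txstate.edu/Volumes/2023/index.html"]  # assumption

executor = ThreadPoolExecutor(max_workers=max(1, int(len(url_list) / 2)))
futures = [executor.submit(process_url, url) for url in url_list]

for future in as_completed(futures):
    try:
        future.result()  # re-raise any exception from the worker thread
    except Exception as err:
        print("Worker failed:", err)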
@@ -444,9 +465,9 @@ for future in as_completed(futures):
 # Retry failed processing paper
 print("START RETRYING:", len(failedData))
 while failedData:
-    data = failedData.pop(0)
+    fail_data = failedData.pop(0)
-    articleTitle = data["title"]
+    articleTitle = fail_data["title"]
-    articleUrl = data["URL"]
+    articleUrl = fail_data["URL"]
     try:
         process_article(articleTitle, articleUrl)
     except Exception as retry_err:
@@ -460,11 +481,11 @@ while failedData:
 # Save remaining data
 if len(articleData) > 0:
     ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
     print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
 
 if len(authorData) > 0:
     ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
     print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
 
 # Save error record
 if len(totallyFailedData) > 0:
@@ -485,5 +506,5 @@ print("Total fetched author:", authorNum)
 print("time elapsed: {:.2f}s".format(time.time() - start_time))
 
 # Transfer to large file and delete the temporary storage files
-ejde_save.Transf()
+ejde_save.transform_data()
-ejde_save.delete()
+# ejde_save.delete_data()

@@ -14,8 +14,8 @@ def save_data(dataset, filetype, filename):
 
 
 # Write into output files
-def Transf():
+def transform_data():
-    def Read(folder_path, output_files):
+    def read(folder_path, output_files):
         # Create new folders
         os.makedirs('./ejde_buffer/Article_output/', exist_ok=True)
         os.makedirs('./ejde_buffer/Author_output/', exist_ok=True)
@@ -24,6 +24,8 @@ def Transf():
         data_2010_2014 = []
         data_2015_2020 = []
         data_newest = []
+        data_no_date = []
+        data_integrate = []
 
         for filename in os.listdir(folder_path):
             if filename.endswith('.json'):
@@ -31,24 +33,29 @@ def Transf():
                 with open(file_path, 'r', encoding='utf-8') as file:
                     data = json.load(file)
                     for Dict in data:
-                        if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
+                        if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) != '':
                             # Select data
                             if int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009:
                                 data_oldest.append(Dict)
 
                             elif int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014:
                                 data_2010_2014.append(Dict)
 
                             elif int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020:
                                 data_2015_2020.append(Dict)
 
                             else:
                                 data_newest.append(Dict)
+                        else:
+                            data_no_date.append(Dict)
 
+        data_integrate.append(data_oldest)
+        data_integrate.append(data_2010_2014)
+        data_integrate.append(data_2015_2020)
+        data_integrate.append(data_newest)
+        data_integrate.append(data_no_date)
 
         # Transfer
-        Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
+        Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest, data_no_date, data_integrate]
 
-        for index in range(0, 4):
+        for index in range(0, 6):
             with open(output_files[index], 'w', encoding='utf-8') as file:
                 json.dump(Data[index], file, indent=4)
 
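The selection logic above reduces to one small rule, shown here as a sketch with an illustrative helper name and a made-up record (the volume/affiliation-year fallback follows the dictionaries produced by ejde_main):

def pick_bucket(record):
    # Prefer the article's 'volume'; fall back to the first affiliation's 'year'.
    year = record.get('volume') or record.get('affiliation', [{}])[0].get('year', 0)
    if year in (None, ''):
        return "no date"
    year = int(year)
    if year <= 2009:
        return "oldest"
    if year <= 2014:
        return "2010-2014"
    if year <= 2020:
        return "2015-2020"
    return "newest"

print(pick_bucket({"affiliation": [{"year": "2016"}]}))  # -> 2015-2020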
@@ -61,26 +68,30 @@ def Transf():
         './ejde_buffer/Author_output/EJDE_Author_output_file(oldest).json',
         './ejde_buffer/Author_output/EJDE_Author_output_file(2010-2014).json',
         './ejde_buffer/Author_output/EJDE_Author_output_file(2015-2020).json',
-        './ejde_buffer/Author_output/EJDE_Author_output_file(newest).json'
+        './ejde_buffer/Author_output/EJDE_Author_output_file(newest).json',
+        './ejde_buffer/Author_output/EJDE_Author_output_file(no date).json',
+        './ejde_buffer/Author_output/EJDE_Author_output_file(integration).json'
     ]
 
     article_output_file = [
         './ejde_buffer/Article_output/EJDE_Article_output_file(oldest).json',
         './ejde_buffer/Article_output/EJDE_Article_output_file(2010-2014).json',
         './ejde_buffer/Article_output/EJDE_Article_output_file(2015-2020).json',
-        './ejde_buffer/Article_output/EJDE_Article_output_file(newest).json'
+        './ejde_buffer/Article_output/EJDE_Article_output_file(newest).json',
+        './ejde_buffer/Article_output/EJDE_Article_output_file(no date).json',
+        './ejde_buffer/Article_output/EJDE_Article_output_file(integration).json'
     ]
 
     # Read and write into files
-    Read(author_folder_path, author_output_file)
+    read(author_folder_path, author_output_file)
-    Read(article_folder_path, article_output_file)
+    read(article_folder_path, article_output_file)
 
     # End
     print("\nData has been written into files.")
 
 
 # Delete files in temporary storage area
-def delete():
+def delete_data():
     folder_paths = ['./ejde_buffer/Author_TS', './ejde_buffer/Article_TS']
     for folder_path in folder_paths:
         file_names = os.listdir(folder_path)
@@ -89,5 +100,4 @@ def delete():
         if os.path.isfile(file_path):
             os.remove(file_path)
         os.rmdir(folder_path)
-
     print('\nAttention: The temporary storage files have been deleted!')
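After this commit the post-run flow is: ejde_main calls ejde_save.transform_data() automatically and leaves the destructive cleanup commented out. Running the cleanup by hand would look roughly like this, assuming the merged output files have already been checked:

import ejde_save

# Merge the small per-thread JSON files into the dated output files ...
ejde_save.transform_data()

# ... then remove the temporary Author_TS / Article_TS folders. This step is
# destructive, which is why ejde_main now ships with it commented out.
ejde_save.delete_data()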