diff --git a/Data/Origin/ejde_buffer.zip b/Data/Origin/ejde_buffer.zip
index a3f9331..3b1b079 100644
Binary files a/Data/Origin/ejde_buffer.zip and b/Data/Origin/ejde_buffer.zip differ
diff --git a/Parsers/00_Citation_spider/get_paper_citation.py b/Parsers/00_Citation_spider/get_paper_citation.py
index 6a76345..41f4ea7 100644
--- a/Parsers/00_Citation_spider/get_paper_citation.py
+++ b/Parsers/00_Citation_spider/get_paper_citation.py
@@ -26,11 +26,12 @@ payload = {
 jwt_token = jwt.encode(payload, secret_key, algorithm="HS256", headers=head)
 
 # Aminer API
-api_get_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish"
-api_get_citation = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/detail/list"
+api_paper_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish"
+api_paper_detail = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/detail/list"
+api_author_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v2/person/search"
 
 
-def aminer_get_id(title):
+def aminer_get_paper_id(title):
     headers = {
         "Authorization": f"Bearer {jwt_token}"
     }
@@ -39,7 +40,7 @@ def aminer_get_id(title):
         "size": "",
         "title": re.sub(r'[^a-zA-Z0-9\s]+', ' ', title).strip()
     }
-    response = requests.get(api_get_id, headers=headers, params=params)
+    response = requests.get(api_paper_id, headers=headers, params=params)
 
     if response.status_code == 200:
         data = response.json()
@@ -49,7 +50,7 @@ def aminer_get_id(title):
         not_on_aminer.append(title)
 
 
-def aminer_post_citation(aminer_id):
+def aminer_post_paper_citation(aminer_id):
     headers = {
         "Content-Type": "application/json;charset=utf-8",
         "Authorization": f"Bearer {jwt_token}"
     }
@@ -57,7 +58,7 @@ def aminer_post_citation(aminer_id):
     request_data = {
         "ids": aminer_id
     }
-    response = requests.post(api_get_citation, headers=headers, data=json.dumps(request_data))
+    response = requests.post(api_paper_detail, headers=headers, data=json.dumps(request_data))
 
     if response.status_code == 200:
         data = response.json()
@@ -72,6 +73,31 @@ def aminer_post_citation(aminer_id):
         aminer_paper_citation_retry.append(aminer_id)
 
 
+def aminer_author_info(author_aminer_id, author_name, offset):
+    headers = {
+        "Content-Type": "application/json;charset=utf-8",
+        "Authorization": f"Bearer {jwt_token}"
+    }
+    request_data = {
+        "ids": author_aminer_id,
+        "query": author_name,
+        "offset": offset
+    }
+    response = requests.post(api_paper_detail, headers=headers, data=json.dumps(request_data))
+
+    if response.status_code == 200:
+        data = response.json()
+        if data.get("success"):
+            for item in data.get('data', []):
+                if 'n_citation' in item:
+                    n_citation = item['n_citation']
+                else:
+                    n_citation = 0
+                aminer_paper_citation.append(n_citation)
+    else:
+        aminer_paper_citation_retry.append(author_aminer_id)
+
+
 def scholarly_get_citation(title):
     # # Set up a ProxyGenerator object to use free proxies.
     # This needs to be done only once per session
     # pg = ProxyGenerator()
@@ -92,8 +118,7 @@
 aminer_paper_citation = []
 aminer_paper_citation_retry = []
 
 # scholarly_get_citation("Traveling waves for unbalanced bistable equations with density dependent diffusion")
-
-aminer_get_id("Heat kernel estimates for fourth-order non-uniformly elliptic operators with non-strongly convex symbols")
+aminer_get_paper_id("Heat kernel estimates for fourth-order non-uniformly elliptic operators with non-strongly convex symbols")
 
 if aminer_paper_id:
-    aminer_post_citation(aminer_paper_id)
+    aminer_post_paper_citation(aminer_paper_id)
 
 print(aminer_paper_citation)
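For reference, a minimal driver for the new aminer_author_info helper. Everything here is illustrative: the person ID and name are placeholders, stepping offset by 10 is an assumed page size rather than documented Aminer behavior, and the aminer_paper_citation list is the module-level global defined further down in the script.

    # Placeholder inputs; a real ID would come from the api_author_id endpoint.
    example_author_id = ["000000000000000000000000"]   # hypothetical Aminer person id
    example_author_name = "Jane Doe"

    # Each call appends one n_citation count per returned paper to the
    # module-level aminer_paper_citation list.
    for offset in (0, 10, 20):                         # assumed paging step
        aminer_author_info(example_author_id, example_author_name, offset)

    print("citation counts:", aminer_paper_citation)
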
diff --git a/Parsers/01_EJDE_spider/ejde_main.py b/Parsers/01_EJDE_spider/ejde_main.py
index d4678c0..0245370 100644
--- a/Parsers/01_EJDE_spider/ejde_main.py
+++ b/Parsers/01_EJDE_spider/ejde_main.py
@@ -1,7 +1,8 @@
+import re
 import time
 import uuid
 import requests
-import re
+import threading
 
 import ejde_save
 
 from retrying import retry
@@ -13,8 +14,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 
 '''
     Crawled site: 'ejde.math.txstate.edu'
-    Total number of papers: 2023/08/08 - 4761
-    Total Time via VPN w/100ms-delay: 306.73s
+    Total number of papers: 2023/08/08 - 4785
+    Total Time via VPN w/100ms-delay: 48.04s
 
     ========== Execution order ==========
    1. ejde_main: fetch the journal links for each year -> scrape each paper's article and author info -> call ejde_save -> stash in small temporary files (json)
 '''
@@ -23,6 +24,12 @@
+def save_data_thread_safe(data, data_lock, data_type):
+    with data_lock:
+        ejde_save.save_data(data, f"{data_type}", str(uuid.uuid4()) + ".json")
+        data.clear()
+
+
 def datetime_transform(date):
     month_typo = {
         "Janaury": "January",
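save_data_thread_safe holds the lock across both the write and the clear(), so a worker cannot append between those two steps and have its records silently dropped. The same pattern in isolation (all names here are illustrative, not part of the scraper):

    import json
    import threading
    import uuid

    shared_batch = []                 # appended to by many worker threads
    batch_lock = threading.Lock()

    def flush_batch(batch, lock, folder):
        # Dump-then-clear under a single lock acquisition: any thread that
        # also takes the lock before appending cannot fall between the steps.
        with lock:
            path = folder + "/" + str(uuid.uuid4()) + ".json"
            with open(path, "w", encoding="utf-8") as fh:
                json.dump(batch, fh, indent=4)
            batch.clear()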

@@ -120,7 +127,7 @@ def process_html_article(baseweb, article):
     # Get article title & url
     try:
         title = article.text.strip()
-        title = re.sub(r'\s+', ' ', title).strip()
+        title = str(re.sub(r'\s+', ' ', title).strip())
         article_url = baseweb + article.find_next("a")["href"]
         if "../../index.html" in article_url:
             print("Redundant URL:", article_url)
@@ -162,11 +169,11 @@ def process_article(title, article_url):
     # Extract title if title == None
     if not title:
         title_match = re.search(r"<h3>(.*?)</h3>", article_text)
-        title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else None
+        title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else ""
 
     # Extract issue
     issue_match = re.search(r'No\. (\d+)', article_text)
-    issue = issue_match.group(1) if issue_match else None
+    issue = issue_match.group(1) if issue_match else ""
 
     # Extract volume
     volume_match = re.search(r'Vol\. (\d+)', article_text)
@@ -189,21 +196,23 @@ def process_article(title, article_url):
         volume = str(volume)
         issue = "Conference " + str(issue_number)
     else:
-        volume = None
+        volume = ""
 
     # Extract pp
     pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
-    pp = pp_match.group(1) if pp_match else None
+    pp = pp_match.group(1) if pp_match else ""
 
     # Extract submission date
     match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
-    submitted_date = match.group(1) if match else None
+    if not match:
+        match = re.search(r"Submitted\s+([A-Za-z]+\s+\d{1,2},\s+\d{4})", html)
+    submitted_date = match.group(1) if match else ""
     if submitted_date:
         submitted_date = datetime_transform(submitted_date)
 
     # Extract publication date
     match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
-    publish_date = match.group(1) if match else None
+    publish_date = match.group(1) if match else ""
     if publish_date:
         publish_date = datetime_transform(publish_date)
@@ -234,25 +243,25 @@ def process_article(title, article_url):
     doi_match = re.search(r'DOI: ([^\t\n<]+)', html)
     if not doi_match:
         doi_match = re.search(r'DOI: (.+)', html)
-    doi = doi_match.group(1) if doi_match else None
-    doi = doi.replace('https://doi.org/', '')  # strip doi website header
+    doi = doi_match.group(1) if doi_match else ""
 
     # Article_id
     article_id = str(uuid.uuid4())
 
     # Author info
     authors = []
+    author_names = []
     table = article_soup.find('table')
     if table:
         for row in table.find_all('tr'):
             cells = [cell.text.strip() for cell in row.find_all('td')]
             for cell in cells:
-                if "email:" in cell:
-                    cell = cell.split("email:")
+                if "email" in cell:
+                    cell = cell.split("email")
                     email_list = str(cell[1]).split(',')
                     cell = cell[0]
-                elif "e-mail:" in cell:
-                    cell = cell.split("e-mail:")
+                elif "e-mail" in cell:
+                    cell = cell.split("e-mail")
                     email_list = str(cell[1]).split(',')
                     cell = cell[0]
                 else:
@@ -264,8 +273,11 @@ def process_article(title, article_url):
 
                 # Data processing
                 if cell[0]:
-                    authors.append(unidecode(cell[0]))
-                    name = re.split(r'[ .]', cell[0])
+                    author_id = str(uuid.uuid4())
+                    authors.append(author_id)
+                    author_names.append(unidecode(cell[0]))
+                    name = re.split(r'\s+', cell[0])
+                    name = [item for item in name if item != '']
                     affiliation = ', '.join(cell[1:]).lstrip(",").rstrip(",").strip()
                     affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
                     affiliation = affiliation.lstrip(",").rstrip(",").strip()
@@ -276,16 +288,19 @@ def process_article(title, article_url):
                     emails.append(unidecode(email_match.group())) if email_match else None
 
                 author_data = {
-                    "author_id": str(uuid.uuid4()),
-                    "from_article": [article_id],
+                    "author_id": author_id,
+                    "from_article": article_id,
                     "first_name": unidecode(name[0]),
                     "last_name": unidecode(name[-1]),
-                    "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None,
-                    "affiliation": [{
-                        "year": volume,
-                        "affiliation": unidecode(affiliation),
-                        "email": emails
-                    }]
+                    "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else "",
+                    "raw_name": unidecode(cell[0]),
+                    "affiliation": [
+                        {
+                            "year": volume,
+                            "affiliation": unidecode(affiliation),
+                            "email": ", ".join(emails)
+                        }
+                    ]
                 }
                 authorData.append(author_data)
                 authorNum += 1
"affiliation": unidecode(affiliation), + "email": ", ".join(emails) + } + ] } authorData.append(author_data) authorNum += 1 @@ -308,12 +323,12 @@ def process_article(title, article_url): matches = matches.replace('["', '').replace('"]', '').replace('[\'', '').replace('\']', '') matches = matches.split("

") for match in matches: - if "email:" in match: - match = match.split("email:") + if "email" in match: + match = match.split("email") email_list = str(match[1]).split(',') match = match[0] - elif "e-mail:" in match: - match = match.split("e-mail:") + elif "e-mail" in match: + match = match.split("e-mail") email_list = str(match[1]).split(',') match = match[0] else: @@ -330,8 +345,11 @@ def process_article(title, article_url): # Data processing if match[0]: + author_id = str(uuid.uuid4()) + authors.append(author_id) authors.append(unidecode(match[0])) - name = re.split(r'[ .]', match[0]) + name = re.split(r'\s+', match[0]) + name = [item for item in name if item != ''] affiliation = ''.join(match[1:]).lstrip(",").rstrip(",").strip() affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))) affiliation = affiliation.lstrip(",").rstrip(",").strip() @@ -342,16 +360,19 @@ def process_article(title, article_url): emails.append(unidecode(email_match.group())) if email_match else None author_data = { - "author_id": str(uuid.uuid4()), - "from_article": [article_id], + "author_id": author_id, + "from_article": article_id, "first_name": unidecode(name[0]), "last_name": unidecode(name[-1]), - "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None, - "affiliation": [{ - "year": volume, - "affiliation": unidecode(affiliation), - "email": emails - }] + "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else "", + "raw_name": unidecode(match[0]), + "affiliation": [ + { + "year": volume, + "affiliation": unidecode(affiliation), + "email": ", ".join(emails) + } + ] } authorData.append(author_data) authorNum += 1 @@ -368,7 +389,7 @@ def process_article(title, article_url): "article_id": article_id, "title": unidecode(title), "authors": authors, - "corresponding_authors": None, + "author_names": author_names, "submit_datetime": submitted_date, "publish_datetime": publish_date, "keywords": keywords, @@ -386,12 +407,10 @@ def process_article(title, article_url): # Save the data periodically based on batch size if len(articleData) % batch_size == 0: - ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json") - articleData.clear() + save_data_thread_safe(articleData, articleDataLock, "Article_TS") if len(authorData) % batch_size == 0: - ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json") - authorData.clear() + save_data_thread_safe(authorData, authorDataLock, "Author_TS") start_time = time.time() @@ -429,6 +448,8 @@ authorNum = 0 articleNum = 0 batch_size = 100 # Number of articles to process before saving +authorDataLock = threading.Lock() +articleDataLock = threading.Lock() executor = ThreadPoolExecutor(max_workers=int(len(url_list) / 2)) # Set the number of worker threads # Process each URL using multithreading @@ -444,9 +465,9 @@ for future in as_completed(futures): # Retry failed processing paper print("START RETRYING:", len(failedData)) while failedData: - data = failedData.pop(0) - articleTitle = data["title"] - articleUrl = data["URL"] + fail_data = failedData.pop(0) + articleTitle = fail_data["title"] + articleUrl = fail_data["URL"] try: process_article(articleTitle, articleUrl) except Exception as retry_err: @@ -460,11 +481,11 @@ while failedData: # Save remaining data if len(articleData) > 0: ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json") - print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/") +print("COMPLETE: All EJDE 
@@ -460,11 +481,11 @@
 
 # Save remaining data
 if len(articleData) > 0:
     ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
-    print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
+print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
 
 if len(authorData) > 0:
     ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
-    print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
+print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
 
 # Save error record
 if len(totallyFailedData) > 0:
@@ -485,5 +506,5 @@
 print("Total fetched author:", authorNum)
 print("time elapsed: {:.2f}s".format(time.time() - start_time))
 
 # Transfer to large file and delete the temporary storage files
-ejde_save.Transf()
-ejde_save.delete()
+ejde_save.transform_data()
+# ejde_save.delete_data()
diff --git a/Parsers/01_EJDE_spider/ejde_save.py b/Parsers/01_EJDE_spider/ejde_save.py
index 7ca56d9..1c6f95b 100644
--- a/Parsers/01_EJDE_spider/ejde_save.py
+++ b/Parsers/01_EJDE_spider/ejde_save.py
@@ -14,8 +14,8 @@ def save_data(dataset, filetype, filename):
 
 
 # Write into output files
-def Transf():
-    def Read(folder_path, output_files):
+def transform_data():
+    def read(folder_path, output_files):
         # Create new folders
         os.makedirs('./ejde_buffer/Article_output/', exist_ok=True)
         os.makedirs('./ejde_buffer/Author_output/', exist_ok=True)
@@ -24,6 +24,8 @@ def Transf():
         data_2010_2014 = []
         data_2015_2020 = []
         data_newest = []
+        data_no_date = []
+        data_integrate = []
 
         for filename in os.listdir(folder_path):
             if filename.endswith('.json'):
@@ -31,24 +33,29 @@ def Transf():
                 with open(file_path, 'r', encoding='utf-8') as file:
                     data = json.load(file)
                     for Dict in data:
-                        if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
+                        if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) != '':
                             # Select data
                             if int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009:
                                 data_oldest.append(Dict)
-
                             elif int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014:
                                 data_2010_2014.append(Dict)
-
                             elif int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020:
                                 data_2015_2020.append(Dict)
-
                             else:
                                 data_newest.append(Dict)
+                        else:
+                            data_no_date.append(Dict)
+
+        data_integrate.append(data_oldest)
+        data_integrate.append(data_2010_2014)
+        data_integrate.append(data_2015_2020)
+        data_integrate.append(data_newest)
+        data_integrate.append(data_no_date)
 
         # Transfer
-        Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
+        Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest, data_no_date, data_integrate]
 
-        for index in range(0, 4):
+        for index in range(0, 6):
             with open(output_files[index], 'w', encoding='utf-8') as file:
                 json.dump(Data[index], file, indent=4)
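The reworked condition treats '' (the new "unknown" sentinel written by ejde_main.py) as missing and routes such records to data_no_date instead of crashing on int(''). A simplified restatement of the routing (the function name is mine, and it folds the volume/year lookup into one step):

    def pick_bucket(record):
        # Articles carry 'volume'; authors carry affiliation[0]['year'].
        year = record.get('volume') or record.get('affiliation', [{}])[0].get('year', 0)
        if year in (None, ''):
            return 'no_date'
        year = int(year)
        if year <= 2009:
            return 'oldest'
        if year <= 2014:
            return '2010_2014'
        if year <= 2020:
            return '2015_2020'
        return 'newest'
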
@@ -61,26 +68,30 @@ def Transf():
         './ejde_buffer/Author_output/EJDE_Author_output_file(oldest).json',
         './ejde_buffer/Author_output/EJDE_Author_output_file(2010-2014).json',
         './ejde_buffer/Author_output/EJDE_Author_output_file(2015-2020).json',
-        './ejde_buffer/Author_output/EJDE_Author_output_file(newest).json'
+        './ejde_buffer/Author_output/EJDE_Author_output_file(newest).json',
+        './ejde_buffer/Author_output/EJDE_Author_output_file(no date).json',
+        './ejde_buffer/Author_output/EJDE_Author_output_file(integration).json'
     ]
 
     article_output_file = [
         './ejde_buffer/Article_output/EJDE_Article_output_file(oldest).json',
         './ejde_buffer/Article_output/EJDE_Article_output_file(2010-2014).json',
         './ejde_buffer/Article_output/EJDE_Article_output_file(2015-2020).json',
-        './ejde_buffer/Article_output/EJDE_Article_output_file(newest).json'
+        './ejde_buffer/Article_output/EJDE_Article_output_file(newest).json',
+        './ejde_buffer/Article_output/EJDE_Article_output_file(no date).json',
+        './ejde_buffer/Article_output/EJDE_Article_output_file(integration).json'
     ]
 
     # Read and write into files
-    Read(author_folder_path, author_output_file)
-    Read(article_folder_path, article_output_file)
+    read(author_folder_path, author_output_file)
+    read(article_folder_path, article_output_file)
 
     # End
     print("\nData has been written into files.")
 
 
 # Delete files in temporary storage area
-def delete():
+def delete_data():
     folder_paths = ['./ejde_buffer/Author_TS', './ejde_buffer/Article_TS']
     for folder_path in folder_paths:
         file_names = os.listdir(folder_path)
@@ -89,5 +100,4 @@ def delete():
             if os.path.isfile(file_path):
                 os.remove(file_path)
         os.rmdir(folder_path)
-
     print('\nAttention: The temporary storage files have been deleted!')
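
With ejde_main.py now leaving the cleanup call commented out, merging and deleting become two deliberate steps. A typical session might look like the sketch below, assuming the ejde_buffer folders produced by a finished crawl:

    import ejde_save

    # Merge the per-batch JSON files into the dated output files,
    # including the new (no date) and (integration) variants.
    ejde_save.transform_data()

    # Only after checking the outputs, drop the temporary _TS buffers.
    ejde_save.delete_data()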