From 27707a058ccf63a8bbe094fda6bb5275fac833c1 Mon Sep 17 00:00:00 2001 From: XCX <1361986662@qq.com> Date: Sun, 13 Aug 2023 21:29:10 +0800 Subject: [PATCH] Change the data structure --- 01_EJDE_spider/ejde_main.py | 28 ++++++++++++++++------------ 02_EJQTDE_spider/ejqtde_scrawler.py | 22 ++++++++-------------- 04_SpringerOpen_spider/SD_detail.py | 15 +++++++-------- 3 files changed, 31 insertions(+), 34 deletions(-) diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py index 1876b22..4ae686f 100644 --- a/01_EJDE_spider/ejde_main.py +++ b/01_EJDE_spider/ejde_main.py @@ -7,7 +7,7 @@ import ejde_save from retrying import retry from datetime import datetime from bs4 import BeautifulSoup -from concurrent.futures import ThreadPoolExecutor, as_completed +from concurrent.futures import ThreadPoolExecutor, as_completed, wait ''' 爬取网站:'ejde.math.txstate.edu' @@ -184,7 +184,7 @@ def process_article(title, article_url): msc = msc_match.group(1).strip().strip('.').strip() msc = re.split(r', |;', msc) else: - msc = None + msc = [] # Extract KeyWords keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL) @@ -195,7 +195,7 @@ def process_article(title, article_url): keywords = re.split(r', |;', keywords) keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords] else: - keywords = None + keywords = [] # Extract DOI doi_match = re.search(r'DOI: (.+)(?=<)', html) @@ -220,6 +220,7 @@ def process_article(title, article_url): # Data processing authors.append(cell[0]) name = cell[0].split(" ") + middle_name = ''.join(name[1:-1]) if name[1:-1] else None affiliation = ', '.join(cell[1:-1]) affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip() email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1]) @@ -227,10 +228,10 @@ def process_article(title, article_url): author_data = { "author_id": str(uuid.uuid4()), - "from_article": article_id, - "firstname": name[0], - "lastname": 
name[-1], - "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(name) > 2 else None, + "from_article": [article_id], + "first_name": name[0], + "last_name": name[-1], + "middle_name": middle_name, "affiliation": [{ "year": volume, "affiliation": affiliation, @@ -256,6 +257,7 @@ def process_article(title, article_url): # Data processing authors.append(match[0]) name = match[0].split(" ") + middle_name = ''.join(name[1:-1]) if name[1:-1] else None affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip() affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip() email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1]) @@ -263,10 +265,10 @@ def process_article(title, article_url): author_data = { "author_id": str(uuid.uuid4()), - "from_article": article_id, - "firstname": name[0], - "lastname": name[-1], - "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(name) > 2 else None, + "from_article": [article_id], + "first_name": name[0], + "last_name": name[-1], + "middle_name": middle_name, "affiliation": [{ "year": volume, "affiliation": affiliation, @@ -345,6 +347,8 @@ for future in as_completed(futures): except Exception as vol_err: print("VOLUME PROCESSING ERROR:", str(vol_err)) +wait(futures) + # Retry failed processing paper print("START RETRYING:", len(failedData)) while failedData: @@ -390,4 +394,4 @@ print("time elapsed: {:.2f}s".format(time.time() - start_time)) # Transfer to large file and delete the temporary storage files ejde_save.Transf() -# ejde_save.delete() +ejde_save.delete() diff --git a/02_EJQTDE_spider/ejqtde_scrawler.py b/02_EJQTDE_spider/ejqtde_scrawler.py index 9601f6f..9c1bd5d 100644 --- a/02_EJQTDE_spider/ejqtde_scrawler.py +++ b/02_EJQTDE_spider/ejqtde_scrawler.py @@ -63,10 +63,10 @@ def author_detail(Data, Year, article_id, Author_list): author_data = { "author_id": str(uuid.uuid4()), - "from_article": 
article_id, - "firstname": Firstname, - "lastname": Lastname, - "middlename": Middlename, + "from_article": [article_id], + "first_name": Firstname, + "last_name": Lastname, + "middle_name": Middlename, "affiliation": [ { "year": Year, @@ -94,24 +94,18 @@ def article_detail(Data, URL, article_id, Aricle_list): del Author[-1] # Submit_datetime and publish_datetime - def timeSet(time): - time = time.split('-') - time[1] = time[1].strip('0') - time = time[0] + '-' + time[1] + '-' + time[2] - return time - time = Data.find('td', attrs={'align': 'right', 'width': '50%'}) time = re.findall(r'\d+-\d+-\d+', str(time)) - Submit_date = timeSet(time[0]) if time[0] else None - Publish_date = timeSet(time[1]) if time[1] else None + Submit_date = time[0] if time[0] else None + Publish_date = time[1] if time[1] else None # Keyword Keyword = Data.find('keywords').get_text() if Data.find('keywords') is not None else None - Keyword = Keyword.split(', ') if Keyword is not None else None + Keyword = Keyword.split(', ') if Keyword is not None else [] # MSC MSC = Data.find('subjectcodes').get_text() if Data.find('subjectcodes') is not None else None - MSC = MSC.split(', ') if MSC is not None else None + MSC = MSC.split(', ') if MSC is not None else [] # DOI if len(re.findall(r' 0: diff --git a/04_SpringerOpen_spider/SD_detail.py b/04_SpringerOpen_spider/SD_detail.py index 2aae1f9..98c78b4 100644 --- a/04_SpringerOpen_spider/SD_detail.py +++ b/04_SpringerOpen_spider/SD_detail.py @@ -1,5 +1,6 @@ import uuid import calendar +from datetime import datetime # ==========获取细节========== @@ -36,9 +37,9 @@ def Author_dict(soup, article_id, Author_list): author_data = { "author_id": str(uuid.uuid4()), "from_article": article_id, - "firstname": Firstname, - "lastname": Lastname, - "middlename": Middlename, + "first_name": Firstname, + "last_name": Lastname, + "middle_name": Middlename, "affiliation": [ { "year": Year, @@ -87,10 +88,8 @@ def Article_dict(soup, url, article_id): Time = [] def 
timeSet(time): - time = time.split(' ') - time[1] = str(list(calendar.month_name).index(time[1])) - time = time[2] + '-' + time[1] + '-' + time[0] - return time + input_date = datetime.strptime(time, "%d %B %Y") + return input_date.strftime("%Y-%m-%d") time_list = info.find('ul', class_='c-bibliographic-information__list') times = time_list.find_all('time') @@ -112,7 +111,7 @@ def Article_dict(soup, url, article_id): Keyword.append(keyword) # MSC - MSC = None # SpringerOpen.com does not have MSC + MSC = [] # SpringerOpen.com does not have MSC # DOI DOI = info.find('li', class_='c-bibliographic-information__list-item c-bibliographic-information__list-item--doi')