From 01c1a7d97830c11404af1e3dcd23fd39641bf31b Mon Sep 17 00:00:00 2001 From: XCX <1361986662@qq.com> Date: Mon, 31 Jul 2023 18:19:11 +0800 Subject: [PATCH] Changed the code to unify the time format --- SpringerOpen_spider/SD_detail.py | 22 ++++++++++++++++------ SpringerOpen_spider/SD_scrawl.py | 2 +- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/SpringerOpen_spider/SD_detail.py b/SpringerOpen_spider/SD_detail.py index 73639c7..2aae1f9 100644 --- a/SpringerOpen_spider/SD_detail.py +++ b/SpringerOpen_spider/SD_detail.py @@ -1,4 +1,6 @@ import uuid +import calendar + # ==========获取细节========== def Author_dict(soup, article_id, Author_list): @@ -20,10 +22,12 @@ def Author_dict(soup, article_id, Author_list): Middlename = ''.join(author[1:-1]) if author[1:-1] else None # Year - Year = info.find('span', attrs={'data-test': 'article-publication-year'}).get_text() + Year = info.find('span', attrs={'data-test': 'article-publication-year'}).get_text() if info.find\ + ('span', attrs={'data-test': 'article-publication-year'}) else None # Affiliation - Affiliation = author_info.find('p', class_='c-article-author-affiliation__address').get_text() + Affiliation = author_info.find('p', class_='c-article-author-affiliation__address').get_text() if author_info.find\ + ('p', class_='c-article-author-affiliation__address') else None # Email Email = None # Can not reach the data @@ -72,7 +76,7 @@ def Article_dict(soup, url, article_id): corresponding_author_list = info.find('p', id='corresponding-author-list') corresponding_authors = corresponding_author_list.find_all('a') - if Corresponding_author is not None: + if corresponding_authors is not None: for corresponding_author in corresponding_authors: corresponding_author = corresponding_author.get_text() corresponding_author = [char.replace('-', '') for char in corresponding_author] @@ -82,14 +86,20 @@ def Article_dict(soup, url, article_id): # Submitted_datetime & Published_datetime Time = [] + def timeSet(time): + time = time.split(' ') + time[1] = str(list(calendar.month_name).index(time[1])) + time = time[2] + '-' + time[1] + '-' + time[0] + return time + time_list = info.find('ul', class_='c-bibliographic-information__list') times = time_list.find_all('time') for time in times: time = time.get_text() Time.append(time) - Submit_date = Time[0] - Publish_date = Time[-1] + Submit_date = timeSet(str(Time[0])) + Publish_date = timeSet(str(Time[-1])) # keyword Keyword = [] # A new empty list @@ -145,4 +155,4 @@ def Article_dict(soup, url, article_id): "page": Page, } - return article_data \ No newline at end of file + return article_data diff --git a/SpringerOpen_spider/SD_scrawl.py b/SpringerOpen_spider/SD_scrawl.py index d2b8ed2..3f9099c 100644 --- a/SpringerOpen_spider/SD_scrawl.py +++ b/SpringerOpen_spider/SD_scrawl.py @@ -12,7 +12,7 @@ def Scrawl(Link, Article_data, Author_data): # 访问论文列表页 headers = SD_link.header() soup = SD_link.Link(Link, headers) - print(Link) + print('Start: ', Link) # 获得所有论文详情页的链接 Essay_Ol = soup.find('ol') # 获取论文列表