diff --git a/SpringerOpen_spider/SD_detail.py b/SpringerOpen_spider/SD_detail.py index 73639c7..2aae1f9 100644 --- a/SpringerOpen_spider/SD_detail.py +++ b/SpringerOpen_spider/SD_detail.py @@ -1,4 +1,6 @@ import uuid +import calendar + # ==========获取细节========== def Author_dict(soup, article_id, Author_list): @@ -20,10 +22,12 @@ def Author_dict(soup, article_id, Author_list): Middlename = ''.join(author[1:-1]) if author[1:-1] else None # Year - Year = info.find('span', attrs={'data-test': 'article-publication-year'}).get_text() + Year = info.find('span', attrs={'data-test': 'article-publication-year'}).get_text() if info.find\ + ('span', attrs={'data-test': 'article-publication-year'}) else None # Affiliation - Affiliation = author_info.find('p', class_='c-article-author-affiliation__address').get_text() + Affiliation = author_info.find('p', class_='c-article-author-affiliation__address').get_text() if author_info.find\ + ('p', class_='c-article-author-affiliation__address') else None # Email Email = None # Can not reach the data @@ -72,7 +76,7 @@ def Article_dict(soup, url, article_id): corresponding_author_list = info.find('p', id='corresponding-author-list') corresponding_authors = corresponding_author_list.find_all('a') - if Corresponding_author is not None: + if corresponding_authors is not None: for corresponding_author in corresponding_authors: corresponding_author = corresponding_author.get_text() corresponding_author = [char.replace('-', '') for char in corresponding_author] @@ -82,14 +86,20 @@ def Article_dict(soup, url, article_id): # Submitted_datetime & Published_datetime Time = [] + def timeSet(time): + time = time.split(' ') + time[1] = str(list(calendar.month_name).index(time[1])) + time = time[2] + '-' + time[1] + '-' + time[0] + return time + time_list = info.find('ul', class_='c-bibliographic-information__list') times = time_list.find_all('time') for time in times: time = time.get_text() Time.append(time) - Submit_date = Time[0] - Publish_date = Time[-1] + Submit_date = timeSet(str(Time[0])) + Publish_date = timeSet(str(Time[-1])) # keyword Keyword = [] # A new empty list @@ -145,4 +155,4 @@ def Article_dict(soup, url, article_id): "page": Page, } - return article_data \ No newline at end of file + return article_data diff --git a/SpringerOpen_spider/SD_scrawl.py b/SpringerOpen_spider/SD_scrawl.py index d2b8ed2..3f9099c 100644 --- a/SpringerOpen_spider/SD_scrawl.py +++ b/SpringerOpen_spider/SD_scrawl.py @@ -12,7 +12,7 @@ def Scrawl(Link, Article_data, Author_data): # 访问论文列表页 headers = SD_link.header() soup = SD_link.Link(Link, headers) - print(Link) + print('Start: ', Link) # 获得所有论文详情页的链接 Essay_Ol = soup.find('ol') # 获取论文列表