Changed the code to unify the time format

2023-07-31 18:19:11 +08:00 · 2023-07-31 18:19:11 +08:00 · 01c1a7d978
commit 01c1a7d978
parent ee0f956645
2 changed files with 17 additions and 7 deletions
--- a/SpringerOpen_spider/SD_detail.py
+++ b/SpringerOpen_spider/SD_detail.py
@@ -1,4 +1,6 @@
 import uuid
 import calendar
 # ==========获取细节==========
 def Author_dict(soup, article_id, Author_list):
@@ -20,10 +22,12 @@ def Author_dict(soup, article_id, Author_list):
        Middlename = ''.join(author[1:-1]) if author[1:-1] else None
        # Year
-        Year = info.find('span', attrs={'data-test': 'article-publication-year'}).get_text()
+        Year = info.find('span', attrs={'data-test': 'article-publication-year'}).get_text() if info.find\
            ('span', attrs={'data-test': 'article-publication-year'}) else None
        # Affiliation
-        Affiliation = author_info.find('p', class_='c-article-author-affiliation__address').get_text()
+        Affiliation = author_info.find('p', class_='c-article-author-affiliation__address').get_text() if author_info.find\
            ('p', class_='c-article-author-affiliation__address') else None
        # Email
        Email = None    # Can not reach the data
@@ -72,7 +76,7 @@ def Article_dict(soup, url, article_id):
    corresponding_author_list = info.find('p', id='corresponding-author-list')
    corresponding_authors = corresponding_author_list.find_all('a')
-    if Corresponding_author is not None:
+    if corresponding_authors is not None:
        for corresponding_author in corresponding_authors:
            corresponding_author = corresponding_author.get_text()
            corresponding_author = [char.replace('-', '') for char in corresponding_author]
@@ -82,14 +86,20 @@ def Article_dict(soup, url, article_id):
    # Submitted_datetime & Published_datetime
    Time = []
    def timeSet(time):
        time = time.split(' ')
        time[1] = str(list(calendar.month_name).index(time[1]))
        time = time[2] + '-' + time[1] + '-' + time[0]
        return time
    time_list = info.find('ul', class_='c-bibliographic-information__list')
    times = time_list.find_all('time')
    for time in times:
        time = time.get_text()
        Time.append(time)
-    Submit_date = Time[0]
+    Submit_date = timeSet(str(Time[0]))
-    Publish_date = Time[-1]
+    Publish_date = timeSet(str(Time[-1]))
    # keyword
    Keyword = []        # A new empty list
@@ -145,4 +155,4 @@ def Article_dict(soup, url, article_id):
        "page": Page,
    }
-    return article_data
+    return article_data
--- a/SpringerOpen_spider/SD_scrawl.py
+++ b/SpringerOpen_spider/SD_scrawl.py
@@ -12,7 +12,7 @@ def Scrawl(Link, Article_data, Author_data):
    # 访问论文列表页
    headers = SD_link.header()
    soup = SD_link.Link(Link, headers)
-    print(Link)
+    print('Start: ', Link)
    # 获得所有论文详情页的链接
    Essay_Ol = soup.find('ol')          # 获取论文列表