Changed the code to unify the time format

This commit is contained in:
XCX 2023-07-31 18:19:11 +08:00
parent ee0f956645
commit 01c1a7d978
2 changed files with 17 additions and 7 deletions

View File

@ -1,4 +1,6 @@
import uuid import uuid
import calendar
# ==========获取细节========== # ==========获取细节==========
def Author_dict(soup, article_id, Author_list): def Author_dict(soup, article_id, Author_list):
@ -20,10 +22,12 @@ def Author_dict(soup, article_id, Author_list):
Middlename = ''.join(author[1:-1]) if author[1:-1] else None Middlename = ''.join(author[1:-1]) if author[1:-1] else None
# Year # Year
Year = info.find('span', attrs={'data-test': 'article-publication-year'}).get_text() Year = info.find('span', attrs={'data-test': 'article-publication-year'}).get_text() if info.find\
('span', attrs={'data-test': 'article-publication-year'}) else None
# Affiliation # Affiliation
Affiliation = author_info.find('p', class_='c-article-author-affiliation__address').get_text() Affiliation = author_info.find('p', class_='c-article-author-affiliation__address').get_text() if author_info.find\
('p', class_='c-article-author-affiliation__address') else None
# Email # Email
Email = None # Can not reach the data Email = None # Can not reach the data
@ -72,7 +76,7 @@ def Article_dict(soup, url, article_id):
corresponding_author_list = info.find('p', id='corresponding-author-list') corresponding_author_list = info.find('p', id='corresponding-author-list')
corresponding_authors = corresponding_author_list.find_all('a') corresponding_authors = corresponding_author_list.find_all('a')
if Corresponding_author is not None: if corresponding_authors is not None:
for corresponding_author in corresponding_authors: for corresponding_author in corresponding_authors:
corresponding_author = corresponding_author.get_text() corresponding_author = corresponding_author.get_text()
corresponding_author = [char.replace('-', '') for char in corresponding_author] corresponding_author = [char.replace('-', '') for char in corresponding_author]
@ -82,14 +86,20 @@ def Article_dict(soup, url, article_id):
# Submitted_datetime & Published_datetime # Submitted_datetime & Published_datetime
Time = [] Time = []
def timeSet(time):
    """Convert a '<day> <month name> <year>' string to '<year>-<month>-<day>'.

    Example: '31 July 2023' -> '2023-7-31'.

    Args:
        time: Date text whose first three whitespace-separated tokens are
            the day, the full month name and the year.

    Returns:
        The date as 'year-month-day', with the month as its number (1-12).
        Day and year are copied through as they appear in the input.

    Raises:
        IndexError: If the text has fewer than three tokens.
        ValueError: If the second token is not a name in calendar.month_name.
    """
    # Split on runs of whitespace rather than a single ' '. With split(' '),
    # leading or repeated whitespace yields empty tokens, and because
    # calendar.month_name[0] is '' an empty token would silently resolve to
    # month 0 and produce a malformed date instead of an error.
    parts = time.split()
    day, month_name, year = parts[0], parts[1], parts[2]
    # calendar.month_name is 1-based (index 0 is ''), so the list index is
    # already the month number.
    month_number = list(calendar.month_name).index(month_name)
    # NOTE(review): month and day are not zero-padded (e.g. '2023-7-31');
    # kept as-is so previously stored values stay comparable -- confirm
    # whether ISO 'YYYY-MM-DD' is wanted.
    return year + '-' + str(month_number) + '-' + day
time_list = info.find('ul', class_='c-bibliographic-information__list') time_list = info.find('ul', class_='c-bibliographic-information__list')
times = time_list.find_all('time') times = time_list.find_all('time')
for time in times: for time in times:
time = time.get_text() time = time.get_text()
Time.append(time) Time.append(time)
Submit_date = Time[0] Submit_date = timeSet(str(Time[0]))
Publish_date = Time[-1] Publish_date = timeSet(str(Time[-1]))
# keyword # keyword
Keyword = [] # A new empty list Keyword = [] # A new empty list
@ -145,4 +155,4 @@ def Article_dict(soup, url, article_id):
"page": Page, "page": Page,
} }
return article_data return article_data

View File

@ -12,7 +12,7 @@ def Scrawl(Link, Article_data, Author_data):
# 访问论文列表页 # 访问论文列表页
headers = SD_link.header() headers = SD_link.header()
soup = SD_link.Link(Link, headers) soup = SD_link.Link(Link, headers)
print(Link) print('Start: ', Link)
# 获得所有论文详情页的链接 # 获得所有论文详情页的链接
Essay_Ol = soup.find('ol') # 获取论文列表 Essay_Ol = soup.find('ol') # 获取论文列表