Changed the code to unify the time format
This commit is contained in:
parent
ee0f956645
commit
01c1a7d978
@ -1,4 +1,6 @@
|
||||
import uuid
|
||||
import calendar
|
||||
|
||||
|
||||
# ==========Get details==========
|
||||
def Author_dict(soup, article_id, Author_list):
|
||||
@ -20,10 +22,12 @@ def Author_dict(soup, article_id, Author_list):
|
||||
Middlename = ''.join(author[1:-1]) if author[1:-1] else None
|
||||
|
||||
# Year
|
||||
Year = info.find('span', attrs={'data-test': 'article-publication-year'}).get_text()
|
||||
Year = info.find('span', attrs={'data-test': 'article-publication-year'}).get_text() if info.find\
|
||||
('span', attrs={'data-test': 'article-publication-year'}) else None
|
||||
|
||||
# Affiliation
|
||||
Affiliation = author_info.find('p', class_='c-article-author-affiliation__address').get_text()
|
||||
Affiliation = author_info.find('p', class_='c-article-author-affiliation__address').get_text() if author_info.find\
|
||||
('p', class_='c-article-author-affiliation__address') else None
|
||||
|
||||
# Email
|
||||
Email = None # Can not reach the data
|
||||
@ -72,7 +76,7 @@ def Article_dict(soup, url, article_id):
|
||||
|
||||
corresponding_author_list = info.find('p', id='corresponding-author-list')
|
||||
corresponding_authors = corresponding_author_list.find_all('a')
|
||||
if Corresponding_author is not None:
|
||||
if corresponding_authors is not None:
|
||||
for corresponding_author in corresponding_authors:
|
||||
corresponding_author = corresponding_author.get_text()
|
||||
corresponding_author = [char.replace('-', '') for char in corresponding_author]
|
||||
@ -82,14 +86,20 @@ def Article_dict(soup, url, article_id):
|
||||
# Submitted_datetime & Published_datetime
|
||||
Time = []
|
||||
|
||||
def timeSet(time):
    """Convert a 'DD Month YYYY' date string to 'YYYY-M-DD'.

    The month name is replaced by its month number (not zero-padded) as
    found in ``calendar.month_name``; the day and year tokens are passed
    through unchanged.

    Args:
        time: Date text such as ``'12 March 2020'``. Leading, trailing and
            repeated whitespace between the tokens is tolerated.

    Returns:
        The date as ``'<year>-<month number>-<day>'``, e.g. ``'2020-3-12'``.

    Raises:
        IndexError: If the text has fewer than three whitespace-separated
            tokens.
        ValueError: If the second token is not a name listed in
            ``calendar.month_name``.
    """
    # split() without an argument collapses runs of whitespace and ignores
    # leading/trailing whitespace, so scraped text with stray spaces or
    # newlines no longer produces empty tokens.
    parts = time.split()
    day, month_name, year = parts[0], parts[1], parts[2]
    # calendar.month_name[0] is an empty string, so the list index of a
    # month name is already its 1-based month number.
    month_number = list(calendar.month_name).index(month_name)
    return year + '-' + str(month_number) + '-' + day
|
||||
|
||||
time_list = info.find('ul', class_='c-bibliographic-information__list')
|
||||
times = time_list.find_all('time')
|
||||
for time in times:
|
||||
time = time.get_text()
|
||||
Time.append(time)
|
||||
|
||||
Submit_date = Time[0]
|
||||
Publish_date = Time[-1]
|
||||
Submit_date = timeSet(str(Time[0]))
|
||||
Publish_date = timeSet(str(Time[-1]))
|
||||
|
||||
# keyword
|
||||
Keyword = [] # A new empty list
|
||||
|
||||
@ -12,7 +12,7 @@ def Scrawl(Link, Article_data, Author_data):
|
||||
# Visit the article list page
|
||||
headers = SD_link.header()
|
||||
soup = SD_link.Link(Link, headers)
|
||||
print(Link)
|
||||
print('Start: ', Link)
|
||||
|
||||
# Get the links to all article detail pages
|
||||
Essay_Ol = soup.find('ol') # 获取论文列表
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user