Changed the code to unify the time format
This commit is contained in:
parent
ee0f956645
commit
01c1a7d978
@@ -1,4 +1,6 @@
 import uuid
+import calendar
+

 # ========== Get the details ==========
 def Author_dict(soup, article_id, Author_list):
@@ -20,10 +22,12 @@ def Author_dict(soup, article_id, Author_list):
     Middlename = ''.join(author[1:-1]) if author[1:-1] else None

     # Year
-    Year = info.find('span', attrs={'data-test': 'article-publication-year'}).get_text()
+    Year = info.find('span', attrs={'data-test': 'article-publication-year'}).get_text() if info.find\
+        ('span', attrs={'data-test': 'article-publication-year'}) else None

     # Affiliation
-    Affiliation = author_info.find('p', class_='c-article-author-affiliation__address').get_text()
+    Affiliation = author_info.find('p', class_='c-article-author-affiliation__address').get_text() if author_info.find\
+        ('p', class_='c-article-author-affiliation__address') else None

     # Email
     Email = None  # Can not reach the data
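Both guarded lookups above call find() twice per field. A minimal sketch of an equivalent single-lookup helper, assuming BeautifulSoup 4; the text_or_none name is illustrative and not part of this commit:

from bs4 import BeautifulSoup

def text_or_none(parent, name, **kwargs):
    # Hypothetical helper: look the tag up once and return its text,
    # or None when it is missing, mirroring the
    # "get_text() if find(...) else None" pattern in the hunk above.
    tag = parent.find(name, **kwargs)
    return tag.get_text() if tag is not None else None

# Tiny illustrative document; the real input is a scraped article page.
soup = BeautifulSoup('<span data-test="article-publication-year">2021</span>', 'html.parser')
print(text_or_none(soup, 'span', attrs={'data-test': 'article-publication-year'}))  # 2021
print(text_or_none(soup, 'p', class_='c-article-author-affiliation__address'))      # None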
@@ -72,7 +76,7 @@ def Article_dict(soup, url, article_id):

     corresponding_author_list = info.find('p', id='corresponding-author-list')
     corresponding_authors = corresponding_author_list.find_all('a')
-    if Corresponding_author is not None:
+    if corresponding_authors is not None:
         for corresponding_author in corresponding_authors:
             corresponding_author = corresponding_author.get_text()
             corresponding_author = [char.replace('-', '') for char in corresponding_author]
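Worth noting: find_all() returns a (possibly empty) list, never None, so the corrected check is always true; a plain truthiness check would also skip the loop when no corresponding authors are listed. A small sketch of that behaviour, assuming BeautifulSoup 4:

from bs4 import BeautifulSoup

# find_all() yields an empty ResultSet rather than None when nothing matches.
soup = BeautifulSoup('<p id="corresponding-author-list"></p>', 'html.parser')
authors = soup.find('p', id='corresponding-author-list').find_all('a')
print(authors is not None)  # True even though no <a> tags were found
print(bool(authors))        # False, so "if corresponding_authors:" would skip the loop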
@@ -82,14 +86,20 @@ def Article_dict(soup, url, article_id):
     # Submitted_datetime & Published_datetime
     Time = []

+    def timeSet(time):
+        time = time.split(' ')
+        time[1] = str(list(calendar.month_name).index(time[1]))
+        time = time[2] + '-' + time[1] + '-' + time[0]
+        return time
+
     time_list = info.find('ul', class_='c-bibliographic-information__list')
     times = time_list.find_all('time')
     for time in times:
         time = time.get_text()
         Time.append(time)

-    Submit_date = Time[0]
-    Publish_date = Time[-1]
+    Submit_date = timeSet(str(Time[0]))
+    Publish_date = timeSet(str(Time[-1]))

     # keyword
     Keyword = []  # A new empty list
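The new timeSet helper is what unifies the time format: it converts dates as scraped from the page, e.g. '05 March 2021', into '2021-3-05' by mapping the month name to its number with calendar.month_name. A standalone sketch of the same conversion (the sample date is illustrative only):

import calendar

def timeSet(time):
    # '05 March 2021' -> ['05', 'March', '2021'] -> '2021-3-05'
    time = time.split(' ')
    time[1] = str(list(calendar.month_name).index(time[1]))
    return time[2] + '-' + time[1] + '-' + time[0]

print(timeSet('05 March 2021'))  # prints 2021-3-05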
@@ -145,4 +155,4 @@ def Article_dict(soup, url, article_id):
         "page": Page,
     }

     return article_data
@@ -12,7 +12,7 @@ def Scrawl(Link, Article_data, Author_data):
     # Visit the article list page
     headers = SD_link.header()
     soup = SD_link.Link(Link, headers)
-    print(Link)
+    print('Start: ', Link)

     # Get the links to all article detail pages
     Essay_Ol = soup.find('ol')  # get the article list