159 lines
5.0 KiB
Python

import uuid
import calendar
# ========== Extract details ==========
def Author_dict(soup, article_id, Author_list):
    """Append one record per listed author of the article to ``Author_list``.

    Args:
        soup: Parsed page exposing the BeautifulSoup tag API
            (``find`` / ``find_all`` / ``get_text``).
        article_id: Identifier stored in each record under ``from_article``.
        Author_list: List that is extended in place with the author records.

    Returns:
        The same ``Author_list`` object, for convenience.

    Each record has a fresh random ``author_id`` (UUID4 string), the name
    split into ``firstname`` / ``middlename`` / ``lastname`` with hyphens
    removed, and a one-element ``affiliation`` list. Missing page elements
    yield ``None`` fields (or no records at all) instead of raising
    ``AttributeError``.
    """
    info = soup.find('article', lang='en')
    if info is None:
        # No English article on the page: nothing to extract.
        return Author_list

    # Year and affiliation are looked up on the article as a whole, not per
    # author, so resolve them once instead of on every loop iteration.
    year_tag = info.find('span', attrs={'data-test': 'article-publication-year'})
    year = year_tag.get_text() if year_tag is not None else None

    # NOTE(review): only the first affiliation address on the page is read and
    # it is attached to every author -- confirm this is intended.
    author_info = info.find('div', id='author-information-content')
    address_tag = None
    if author_info is not None:
        address_tag = author_info.find(
            'p', class_='c-article-author-affiliation__address')
    affiliation = address_tag.get_text() if address_tag is not None else None

    email = None  # Cannot be reached from the page

    article_info = info.find('div', class_='c-article-header')
    author_ul = None
    if article_info is not None:
        author_ul = article_info.find('ul', class_='c-article-author-list')
    if author_ul is None:
        return Author_list

    for item in author_ul.find_all('li', class_='c-article-author-list__item'):
        link = item.find('a')
        if link is None:
            # List item without a name link: skip rather than crash.
            continue

        # split() (no argument) collapses repeated and non-breaking spaces, so
        # no empty name tokens are produced; hyphens are dropped per token.
        tokens = [tok.replace('-', '') for tok in link.get_text().split()]
        tokens = [tok for tok in tokens if tok]
        if not tokens:
            continue

        middle_tokens = tokens[1:-1]
        Author_list.append({
            "author_id": str(uuid.uuid4()),
            "from_article": article_id,
            "firstname": tokens[0],
            # A single-token name is used as both first and last name.
            "lastname": tokens[-1],
            "middlename": ''.join(middle_tokens) if middle_tokens else None,
            # A new dict per author so records never share mutable state.
            "affiliation": [
                {
                    "year": year,
                    "affiliation": affiliation,
                    "email": email
                }
            ]
        })
    return Author_list
def _find_or_none(parent, *args, **kwargs):
    """Call ``parent.find(...)``, returning ``None`` when ``parent`` is ``None``."""
    return parent.find(*args, **kwargs) if parent is not None else None


def _text_or_none(tag):
    """Return ``tag.get_text()``, or ``None`` when the lookup found nothing."""
    return tag.get_text() if tag is not None else None


def _to_ymd(text):
    """Convert ``'12 March 2020'`` to ``'2020-3-12'``; ``None`` if unparseable.

    Month and day are intentionally not zero-padded, to keep the output
    format this module has always produced. Month names are matched against
    ``calendar.month_name``, which follows the current locale.
    """
    parts = text.split()
    if len(parts) < 3:
        return None
    day, month_word, year = parts[0], parts[1], parts[2]
    months = list(calendar.month_name)
    # months[0] is the empty string placeholder, never a real month.
    if month_word not in months[1:]:
        return None
    return year + '-' + str(months.index(month_word)) + '-' + day


def Article_dict(soup, url, article_id):
    """Build the metadata record for one article page.

    Args:
        soup: Parsed page exposing the BeautifulSoup tag API
            (``find`` / ``find_all`` / ``get_text``).
        url: Address of the page, stored under ``URL``.
        article_id: Identifier stored under ``article_id``.

    Returns:
        A dict with the keys ``article_id``, ``title``, ``authors``,
        ``corresponding_authors``, ``submit_datetime``, ``publish_datetime``,
        ``keywords``, ``MSC``, ``URL``, ``DOI``, ``publisher``, ``journal``,
        ``volume``, ``issue`` and ``page``. Any element that is absent from
        the page yields ``None`` (or an empty list for the list-valued
        fields) instead of raising ``AttributeError`` / ``IndexError``.
    """
    info = soup.find('article', lang='en')
    article_info = _find_or_none(info, 'div', class_='c-article-header')

    # Title
    title = _text_or_none(_find_or_none(article_info, 'h1'))

    # Authors: full display names with hyphens removed.
    authors = []
    author_ul = _find_or_none(article_info, 'ul', class_='c-article-author-list')
    if author_ul is not None:
        for item in author_ul.find_all('li', class_='c-article-author-list__item'):
            link = item.find('a')
            if link is not None:
                authors.append(link.get_text().replace('-', ''))

    # Corresponding authors. The container itself may be missing; find_all()
    # never returns None, so the container is what has to be checked.
    corresponding_authors = []
    corresponding_p = _find_or_none(info, 'p', id='corresponding-author-list')
    if corresponding_p is not None:
        corresponding_authors = [
            link.get_text().replace('-', '')
            for link in corresponding_p.find_all('a')
        ]

    # Submitted / published dates: first and last <time> of the bibliographic
    # list (they coincide when only one date is present).
    time_ul = _find_or_none(info, 'ul', class_='c-bibliographic-information__list')
    times = []
    if time_ul is not None:
        times = [tag.get_text() for tag in time_ul.find_all('time')]
    submit_date = _to_ymd(str(times[0])) if times else None
    publish_date = _to_ymd(str(times[-1])) if times else None

    # Keywords
    keywords = []
    keyword_ul = _find_or_none(info, 'ul', class_='c-article-subject-list')
    if keyword_ul is not None:
        keywords = [item.get_text() for item in keyword_ul.find_all('li')]

    # MSC
    msc = None  # SpringerOpen.com does not have MSC

    # DOI
    doi_item = _find_or_none(
        info, 'li',
        class_='c-bibliographic-information__list-item c-bibliographic-information__list-item--doi')
    doi = _text_or_none(
        _find_or_none(doi_item, 'span', class_='c-bibliographic-information__value'))

    # Publisher
    publisher = 'springeropen.com'

    # Journal and issue both live in the article-info details paragraph.
    details = _find_or_none(info, 'p', class_='c-article-info-details')
    journal = _text_or_none(_find_or_none(details, 'i'))
    issue = _text_or_none(
        _find_or_none(details, 'span', attrs={'data-test': 'article-number'}))

    # NOTE(review): the volume is read from the publication-year element,
    # exactly as before -- confirm that year-as-volume is intended.
    volume = _text_or_none(
        _find_or_none(info, 'span', attrs={'data-test': 'article-publication-year'}))

    # Page
    page = None

    return {
        "article_id": article_id,
        "title": title,
        "authors": authors,
        "corresponding_authors": corresponding_authors,
        "submit_datetime": submit_date,
        "publish_datetime": publish_date,
        "keywords": keywords,
        "MSC": msc,
        "URL": url,
        "DOI": doi,
        "publisher": publisher,
        "journal": journal,
        "volume": volume,
        "issue": issue,
        "page": page,
    }