148 lines
4.6 KiB
Python
148 lines
4.6 KiB
Python
import uuid
|
|
|
|
# ========== Extract details ==========
|
|
def Author_dict(soup, article_id, Author_list):
    """Append one author record per listed author of the article to Author_list.

    Args:
        soup: Parsed article page. It must provide the BeautifulSoup-style
            ``find`` / ``find_all`` / ``get_text`` interface.
        article_id: Identifier stored in every record under ``from_article``.
        Author_list: List that receives the new records; it is mutated in place.

    Returns:
        The same ``Author_list`` object, extended with one dict per author.
        Each dict gets a fresh random ``author_id`` (``uuid.uuid4``).

    Raises:
        AttributeError: If a page element that is looked up here is missing
            (the corresponding ``find`` returned ``None``).
    """
    info = soup.find('article', lang='en')
    author_info = info.find('div', id='author-information-content')
    article_info = info.find('div', class_='c-article-header')

    author_items = article_info.find(
        'ul', class_='c-article-author-list'
    ).find_all('li', class_='c-article-author-list__item')
    if not author_items:
        # Nothing to add. Returning here also keeps the page-level lookups
        # below from running (and possibly failing) when no author needs them.
        return Author_list

    # Year and affiliation do not depend on the individual author, so they are
    # looked up once instead of once per loop iteration.
    year = info.find('span', attrs={'data-test': 'article-publication-year'}).get_text()
    # NOTE(review): every author receives the FIRST affiliation address found
    # in the author-information section - confirm this is intended.
    affiliation = author_info.find('p', class_='c-article-author-affiliation__address').get_text()

    # The e-mail address is not reachable from the page data.
    email = None

    for item in author_items:
        raw_name = item.find('a').get_text()
        # split() without an argument ignores leading/trailing/repeated
        # whitespace (including non-breaking spaces), so stray whitespace in
        # the markup cannot produce empty name parts. Hyphens are dropped
        # from every part.
        name_parts = [part.replace('-', '') for part in raw_name.split()]

        # A blank name yields empty strings; a single-word name is used as
        # both first and last name.
        firstname = name_parts[0] if name_parts else ''
        lastname = name_parts[-1] if name_parts else ''
        middle_parts = name_parts[1:-1]
        middlename = ''.join(middle_parts) if middle_parts else None

        Author_list.append({
            "author_id": str(uuid.uuid4()),
            "from_article": article_id,
            "firstname": firstname,
            "lastname": lastname,
            "middlename": middlename,
            "affiliation": [
                {
                    "year": year,
                    "affiliation": affiliation,
                    "email": email
                }
            ]
        })

    return Author_list
|
|
|
|
|
|
def Article_dict(soup, url, article_id):
    """Collect the metadata of one article page into a dict.

    Args:
        soup: Parsed article page. It must provide the BeautifulSoup-style
            ``find`` / ``find_all`` / ``get_text`` interface.
        url: Stored unchanged under the ``URL`` key.
        article_id: Stored unchanged under the ``article_id`` key.

    Returns:
        A dict with the keys ``article_id``, ``title``, ``authors``,
        ``corresponding_authors``, ``submit_datetime``, ``publish_datetime``,
        ``keywords``, ``MSC``, ``URL``, ``DOI``, ``publisher``, ``journal``,
        ``volume``, ``issue`` and ``page``. ``corresponding_authors`` and
        ``keywords`` are empty lists and ``DOI`` is ``None`` when the
        corresponding page element is absent.

    Raises:
        AttributeError: If one of the other looked-up page elements is missing.
        IndexError: If the bibliographic list contains no ``time`` element.
    """
    info = soup.find('article', lang='en')
    article_info = info.find('div', class_='c-article-header')

    # Title
    title = article_info.find('h1').get_text()

    # Authors: link text of every author list item, hyphens removed.
    author_items = article_info.find(
        'ul', class_='c-article-author-list'
    ).find_all('li', class_='c-article-author-list__item')
    authors = [item.find('a').get_text().replace('-', '') for item in author_items]

    # Corresponding authors: the paragraph may be absent, so test the lookup
    # result itself before asking it for links.
    corresponding_authors = []
    corresponding_block = info.find('p', id='corresponding-author-list')
    if corresponding_block is not None:
        corresponding_authors = [
            link.get_text().replace('-', '')
            for link in corresponding_block.find_all('a')
        ]

    # Submission and publication date: first and last <time> element of the
    # bibliographic information list.
    time_list = info.find('ul', class_='c-bibliographic-information__list')
    dates = [entry.get_text() for entry in time_list.find_all('time')]
    submitted_date = dates[0]
    publish_date = dates[-1]

    # Keywords (optional section)
    keywords = []
    keyword_list = info.find('ul', class_='c-article-subject-list')
    if keyword_list is not None:
        keywords = [entry.get_text() for entry in keyword_list.find_all('li')]

    # MSC: SpringerOpen.com does not have MSC
    msc = None

    # DOI (optional list item)
    doi = info.find('li', class_='c-bibliographic-information__list-item c-bibliographic-information__list-item--doi')
    if doi is not None:
        doi = doi.find('span', class_='c-bibliographic-information__value').get_text()

    # Publisher
    publisher = 'springeropen.com'

    # Journal and issue are both read from the article-info paragraph.
    info_details = info.find('p', class_='c-article-info-details')
    journal = info_details.find('i').get_text()

    # NOTE(review): "volume" is filled from the publication-year element -
    # confirm that this is the intended source.
    volume = info.find('span', attrs={'data-test': 'article-publication-year'}).get_text()

    issue = info_details.find('span', attrs={'data-test': 'article-number'}).get_text()

    # Page: not extracted
    page = None

    article_data = {
        "article_id": article_id,
        "title": title,
        "authors": authors,
        "corresponding_authors": corresponding_authors,
        "submit_datetime": submitted_date,
        "publish_datetime": publish_date,
        "keywords": keywords,
        "MSC": msc,
        "URL": url,
        "DOI": doi,
        "publisher": publisher,
        "journal": journal,
        "volume": volume,
        "issue": issue,
        "page": page,
    }

    return article_data