import uuid
from datetime import datetime


# ========== Get details ==========
def Author_dict(soup, article_id, Author_list):
    info = soup.find('article', lang='en')
    author_info = info.find('div', id='author-information-content')
    article_info = info.find('div', class_='c-article-header')

    # Author
    authors = article_info.find('ul', class_='c-article-author-list')
    authors = authors.find_all('li', class_='c-article-author-list__item')
    for author in authors:
        # Name
        author = author.find('a').get_text()
        author = author.split(' ')
        author = [part.replace('-', '') for part in author]
        Firstname = author[0]
        Lastname = author[-1]
        Middlename = ''.join(author[1:-1]) if len(author[1:-1]) > 0 else None

        # Year
        year_tag = info.find('span', attrs={'data-test': 'article-publication-year'})
        Year = year_tag.get_text() if year_tag else None

        # Affiliation
        affiliation_tag = author_info.find('p', class_='c-article-author-affiliation__address')
        Affiliation = affiliation_tag.get_text() if affiliation_tag else None

        # Email
        Email = None  # Cannot reach the data

        # Input into dict
        author_data = {
            "author_id": str(uuid.uuid4()),
            "from_article": article_id,
            "first_name": Firstname,
            "last_name": Lastname,
            "middle_name": Middlename,
            "affiliation": [
                {
                    "year": Year,
                    "affiliation": Affiliation,
                    "email": Email
                }
            ]
        }
        Author_list.append(author_data)
    return Author_list


def Article_dict(soup, url, article_id):
    info = soup.find('article', lang='en')
    article_info = info.find('div', class_='c-article-header')

    # Title
    Title = article_info.find('h1').get_text()

    # Author
    Author = []  # A new empty list
    author_list = article_info.find('ul', class_='c-article-author-list')
    authors = author_list.find_all('li', class_='c-article-author-list__item')
    for author in authors:
        author = author.find('a').get_text()
        author = author.replace('-', '')
        Author.append(author)

    # Corresponding_author
    Corresponding_author = []  # A new empty list
    corresponding_author_list = info.find('p', id='corresponding-author-list')
    if corresponding_author_list is not None:
        corresponding_authors = corresponding_author_list.find_all('a')
        for corresponding_author in corresponding_authors:
            corresponding_author = corresponding_author.get_text().replace('-', '')
            Corresponding_author.append(corresponding_author)

    # Submitted_datetime & Published_datetime
    Time = []

    def timeSet(time):
        input_date = datetime.strptime(time, "%d %B %Y")
        return input_date.strftime("%Y-%m-%d")

    time_list = info.find('ul', class_='c-bibliographic-information__list')
    times = time_list.find_all('time')
    for time in times:
        time = time.get_text()
        Time.append(time)
    Submit_date = timeSet(str(Time[0]))
    Publish_date = timeSet(str(Time[-1]))

    # Keyword
    Keyword = []  # A new empty list
    keyword_list = info.find('ul', class_='c-article-subject-list')
    if keyword_list is not None:
        keywords = keyword_list.find_all('li')
        for keyword in keywords:
            keyword = keyword.get_text()
            Keyword.append(keyword)

    # MSC
    MSC = []  # SpringerOpen.com does not have MSC

    # DOI
    DOI = info.find('li', class_='c-bibliographic-information__list-item c-bibliographic-information__list-item--doi')
    if DOI is not None:
        DOI = DOI.find('span', class_='c-bibliographic-information__value').get_text()

    # Publisher
    Publisher = 'springeropen.com'

    # Journal
    Journal = info.find('p', class_='c-article-info-details')
    Journal = Journal.find('i').get_text()

    # Volume (SpringerOpen pages list the publication year as the volume)
    Volume = info.find('span', attrs={'data-test': 'article-publication-year'}).get_text()

    # Issue
    Issue = info.find('p', class_='c-article-info-details')
    Issue = Issue.find('span', attrs={'data-test': 'article-number'}).get_text()

    # Page
    Page = None

    # Input into dict
    article_data = {
        "article_id": article_id,
        "title": Title,
        "authors": Author,
        "corresponding_authors": Corresponding_author,
        "submit_datetime": Submit_date,
        "publish_datetime": Publish_date,
        "keywords": Keyword,
        "MSC": MSC,
        "URL": url,
        "DOI": DOI,
        "publisher": Publisher,
        "journal": Journal,
        "volume": Volume,
        "issue": Issue,
        "page": Page,
    }
    return article_data
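

# Usage sketch (not part of the original module): both functions expect a BeautifulSoup
# tree of a SpringerOpen article page. This minimal driver assumes the `requests` and
# `beautifulsoup4` packages are installed; the URL below is a placeholder, not a real article.
if __name__ == '__main__':
    import requests
    from bs4 import BeautifulSoup

    url = 'https://example.springeropen.com/articles/10.1186/placeholder'  # placeholder URL
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    article_id = str(uuid.uuid4())
    article = Article_dict(soup, url, article_id)
    authors = Author_dict(soup, article_id, [])
    print(article)
    print(authors)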