47 lines
1.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import time
import urllib
import uuid
from urllib.parse import urljoin
import SD_link
import SD_detail
import SD_save
# ==========获取论文详情页链接==========
def Scrawl(Link, Article_data, Author_data, *,
           base_url='https://advancesincontinuousanddiscretemodels.springeropen.com/'):
    """Scrape one article-list page and buffer the collected records as JSON.

    The list page at ``Link`` is fetched through ``SD_link.Link``; every
    ``<li>`` of its first ``<ol>`` that contains an ``<a itemprop="url">``
    is followed to the article detail page, which is parsed with
    ``SD_detail``. Afterwards the article and author records are each
    written to a freshly named JSON file through ``SD_save.save_data``.

    Args:
        Link: URL of the article-list page.
        Article_data: list that receives one article record per detail
            page (appended in place).
        Author_data: author collection handed to ``SD_detail.Author_dict``;
            the local name is rebound to that function's return value.
        base_url: base against which relative detail-page links are
            resolved. Defaults to the journal site the original code
            hard-coded.

    Returns:
        None. Results are only written out via ``SD_save.save_data``.
    """
    # Fetch the article-list page.
    headers = SD_link.header()
    soup = SD_link.Link(Link, headers)
    print('Start: ', Link)

    # Locate the ordered list that holds the articles. ``find`` yields None
    # when the page has no <ol> (e.g. an error or empty page); without this
    # guard the following ``find_all`` would raise AttributeError.
    # NOTE(review): whether SD_link.Link can itself return None is not
    # visible here; the extra check is defensive.
    essay_list = soup.find('ol') if soup is not None else None
    if essay_list is None:
        print('Skipped: no article list found at', Link)
        return

    # Visit every article linked from this list page.
    for entry in essay_list.find_all('li'):
        anchor = entry.find('a', itemprop='url')
        if anchor is None:
            # List items without a detail-page link are ignored.
            continue

        time.sleep(0.1)  # brief pause between detail-page requests
        sub_link = urllib.parse.urljoin(base_url, anchor.get('href'))

        # Fetch the article detail page.
        sub_soup = SD_link.Link(sub_link, headers)
        article_id = str(uuid.uuid4())  # id passed to both article and author parsers

        # Extract the details and add them to the matching collections.
        Article_data.append(SD_detail.Article_dict(sub_soup, sub_link, article_id))
        Author_data = SD_detail.Author_dict(sub_soup, article_id, Author_data)

    # Buffer the collected records as small JSON files.
    if Article_data:
        index = str(uuid.uuid4())
        SD_save.save_data(Article_data, "Article_TS", index + ".json")
        print('Finished: ' + index + ' has been added to ./SpringerOpen_buffer/Article_TS/')
    if Author_data:
        index = str(uuid.uuid4())
        SD_save.save_data(Author_data, "Author_TS", index + ".json")
        print('Finished: ' + index + ' has been added to ./SpringerOpen_buffer/Author_TS/')