47 lines
1.8 KiB
Python
47 lines
1.8 KiB
Python
import time
|
||
import urllib
|
||
import uuid
|
||
from urllib.parse import urljoin
|
||
|
||
import SD_link
|
||
import SD_detail
|
||
import SD_save
|
||
|
||
# ========== Fetch paper detail-page links ==========
def Scrawl(Link, Article_data, Author_data):
    """Crawl one SpringerOpen paper-listing page and buffer the results.

    Fetches the listing page at *Link*, follows every paper detail link
    found in its ``<ol>`` list, extracts article and author details via
    ``SD_detail``, and stages both accumulators as small JSON files via
    ``SD_save``.

    Parameters
    ----------
    Link : str
        URL of the paper listing page to crawl.
    Article_data : list
        Accumulator for article dicts; appended to in place.
    Author_data : list
        Accumulator for author dicts; passed through
        ``SD_detail.Author_dict`` (which returns the updated list).
    """
    # Fetch the listing page.
    headers = SD_link.header()
    soup = SD_link.Link(Link, headers)
    print('Start: ', Link)

    # Collect links to every paper detail page on this listing page.
    Essay_Ol = soup.find('ol')  # the <ol> holding the paper list
    if Essay_Ol is None:
        # Guard: soup.find returns None when the page layout changed or the
        # fetch failed; without this, .find_all below raises AttributeError.
        return
    Essay_Li = Essay_Ol.find_all('li')  # one <li> per paper

    # Crawl every paper on this listing page (up to 50 per page).
    for Essay_hrefs in Essay_Li:
        Essay_href = Essay_hrefs.find('a', itemprop='url')
        if Essay_href is not None:
            time.sleep(0.1)  # throttle: be polite to the server

            sub_Link = Essay_href.get('href')  # may be relative
            # Resolve relative hrefs against the journal's base URL.
            # urljoin is already imported at top of file; no need for the
            # redundant urllib.parse. prefix.
            sub_Link = urljoin('https://advancesincontinuousanddiscretemodels.springeropen.com/', sub_Link)

            # ========== Visit the paper detail page ==========
            sub_soup = SD_link.Link(sub_Link, headers)
            article_id = str(uuid.uuid4())  # id linking article to its authors

            # Extract details and add them to the accumulators.
            Article_data.append(SD_detail.Article_dict(sub_soup, sub_Link, article_id))
            Author_data = SD_detail.Author_dict(sub_soup, article_id, Author_data)

    # Stage the batch in JSON buffer files (small-file temporary storage).
    if Article_data:
        index = str(uuid.uuid4())
        SD_save.save_data(Article_data, "Article_TS", index + ".json")
        print('Finished: ' + index + ' has been added to ./SpringerOpen_buffer/Article_TS/')

    if Author_data:
        index = str(uuid.uuid4())
        SD_save.save_data(Author_data, "Author_TS", index + ".json")
        print('Finished: ' + index + ' has been added to ./SpringerOpen_buffer/Author_TS/')