# SD_main — entry point of the SpringerOpen mathematics-journal crawler.
import urllib
|
||
|
||
import SD_header
|
||
import SD_link
|
||
import SD_threads
|
||
import SD_save
|
||
from urllib.parse import urljoin
|
||
|
||
'''
Crawl target: https://www.springeropen.com

========== Execution order ==========
1. SD_main    Collect the links to all mathematics journals on SpringerOpen
              -> collect the links to each journal's article-list pages
2. SD_threads Thread-pool management -> calls SD_scrawl
3. SD_scrawl  Collect article-detail page links -> calls SD_detail
4. SD_detail  Fetch and process article-detail content -> calls SD_save
              -> buffer each record in a small JSON file
5. SD_main    Calls SD_save -> filter the buffered small files and merge
              them into per-year aggregate files
*6. SD_save   (optional) Delete all buffered files (back up first!)
'''

# Site root; also the base for resolving journal hrefs found on the page.
BASE_URL = 'https://www.springeropen.com'

# URLs still to be crawled (one entry per article-list results page).
Links = []  # A list for links
# Per-journal metadata: {"url": paginated base URL, "page": page count}.
Webs = []   # A list for web url

# Accumulators filled by the worker threads with scraped data.
Article_data = []
Author_data = []

# ========== Visit the journal list page ==========
# Collect the links of the mathematics journals.
headers = SD_header.header()
soup = SD_link.Link(BASE_URL + '/journals', headers)

# Fail fast with a clear message if the page layout changed
# (the original code crashed with an opaque AttributeError here).
math_list = soup.find('ol', id='Mathematics-list')
if math_list is None:
    raise RuntimeError("Could not find element <ol id='Mathematics-list'> "
                       "on the journals page; the site layout may have changed.")

for anchor in math_list.find_all('a'):
    # urljoin resolves relative, protocol-relative ("//host/path") and
    # absolute hrefs alike, keeping the https scheme of BASE_URL.
    # (The original hard-coded 'http:' + href, which assumed every href
    # was protocol-relative and downgraded the scheme to http.)
    journal_articles_url = urljoin(BASE_URL, anchor.get('href')) + '/articles'
    sub_soup = SD_link.Link(journal_articles_url, headers)

    # Number of article-list pages for this journal: the pagination text
    # ends with the total page count (e.g. "Page 1 of 12" -> "12").
    pagination = sub_soup.find('p', class_='u-text-sm u-reset-margin')
    if pagination is None:
        # Single page of results carries no pagination element.
        page_count = 1
    else:
        page_count = int(pagination.get_text().split(' ')[-1])

    # Build the paginated search URL; the page number is appended later.
    url = urljoin(journal_articles_url,
                  'articles?searchType=journalSearch&sort=PubDate&page=')

    # Record this journal's paginated base URL and its page count.
    Webs.append({
        "url": url,
        "page": page_count
    })

# Expand every journal into one concrete URL per results page.
for web in Webs:
    for page in range(1, web['page'] + 1):
        Links.append(web['url'] + str(page))

print('\nThe links have been stored!\n')

# Hand the page URLs to the thread pool to start crawling.
SD_threads.Threads(Links, Article_data, Author_data)

# Merge the buffered JSON files into per-year aggregate files.
SD_save.Transf()

# # ========== Delete all buffered small files (optional — back up first!) ===========
# SD_save.delete('./SpringerOpen_buffer/Article_TS/')
# SD_save.delete('./SpringerOpen_buffer/Author_TS/')
|
||
|
||
|
||
|
||
|