Chenxiao Xia 34fb579f7c Fix bugs
2023-09-16 18:46:52 +08:00

69 lines
2.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import urllib
import SD_link
import SD_threads
import SD_save
from urllib.parse import urljoin
'''
Crawler for https://www.springeropen.com
========== Execution order ==========
1. SD_main    collects the links of every mathematics journal on SpringerOpen,
              then the links of each journal's article-list pages
2. SD_threads manages the thread pool and dispatches work to SD_scrawl
3. SD_scrawl  extracts article detail-page links and calls SD_detail
4. SD_detail  scrapes and processes each detail page, then calls SD_save to
              stash the result in a small temporary JSON file
5. SD_main    calls SD_save to filter the stashed files and merge them into
              per-year archive files
*6. SD_save   (optional) deletes every file in the staging area (back up first!)
'''
# ---- Stage 1: collect the paginated article-list URL for every maths journal ----
Webs = []   # per-journal records: {"url": paginated search URL, "page": page count}
Links = []  # flat list of every article-list page to hand to the crawler

# Fetch the journal index and grab every anchor in the Mathematics section.
headers = SD_link.header()
soup = SD_link.Link('https://www.springeropen.com/journals', headers)
anchors = soup.find('ol', id='Mathematics-list').find_all('a')

for anchor in anchors:
    # The hrefs are presumably protocol-relative ("//journal...") — hence the
    # explicit scheme — and we jump straight to the journal's article listing.
    journal_url = 'http:' + anchor.get('href') + '/articles'
    journal_soup = SD_link.Link(journal_url, headers)
    # The pager caption ends with the total page count (e.g. "Page 1 of 12").
    caption = journal_soup.find('p', class_='u-text-sm u-reset-margin').get_text()
    page_count = int(caption.rsplit(' ', 1)[-1])
    # Normalise into the date-sorted search URL; the page number gets appended later.
    Webs.append({
        "url": urljoin(journal_url, 'articles?searchType=journalSearch&sort=PubDate&page='),
        "page": page_count,
    })

# ---- Stage 2: expand each journal into one concrete URL per listing page ----
Links = [record["url"] + str(page_no)
         for record in Webs
         for page_no in range(1, record["page"] + 1)]
print('\nThe links have been stored!\n')

# ---- Stage 3: crawl via the thread pool, then consolidate the per-article JSON ----
SD_threads.Threads(Links)
SD_save.Transf()
# ---- Optional: wipe the staging area of temporary files (back up first!) ----
SD_save.delete()