import SD_link
import SD_threads
import SD_save
from urllib.parse import urljoin

'''
Target site: https://www.springeropen.com

========== Execution order ==========
1. SD_main    collects the links of all mathematics journals on SpringerOpen
              -> collects the links of each journal's article-list pages
2. SD_threads manages the thread pool -> calls SD_scrawl
3. SD_scrawl  collects the links of the article detail pages -> calls SD_detail
4. SD_detail  fetches and parses each detail page -> calls SD_save
              -> stores the result in a small temporary JSON file
5. SD_main    calls SD_save -> scans the temporary files locally, filters
              them, and merges them into one large file per year
*6. SD_save   (optional) deletes every file in the temporary area
              (back them up first)
'''

# Empty lists for web pages and links
Webs = []   # one {url, page-count} record per journal
Links = []  # the article-list page URLs to crawl

# Empty lists for the crawled data
Article_data = []
Author_data = []

# ========== Visit the article-list pages ==========
# Collect the links of the mathematics journals
headers = SD_link.header()
soup = SD_link.Link('https://www.springeropen.com/journals', headers)
hrefs = soup.find('ol', id='Mathematics-list')
hrefs = hrefs.find_all('a')

for href in hrefs:
    # The hrefs are protocol-relative ('//journal.springeropen.com'),
    # so prepend a scheme and append the article-list path
    href = 'https:' + href.get('href') + '/articles'
    sub_soup = SD_link.Link(href, headers)

    # Read the number of article-list pages of the current journal
    # (the page count is the last token of the pagination text)
    pp = sub_soup.find('p', class_='u-text-sm u-reset-margin').get_text()
    pp = pp.split(' ')[-1]

    # Build the paginated list URL (the page number is appended below)
    url = urljoin(href, 'articles?searchType=journalSearch&sort=PubDate&page=')

    # Store the journal record
    web = {
        "url": url,
        "page": int(pp)
    }
    Webs.append(web)

# Expand every journal record into one link per list page
for web in Webs:
    for page in range(1, web['page'] + 1):
        link = web['url'] + str(page)
        Links.append(link)

print('\nThe links have been stored!\n')

# Hand the links to the thread pool and start crawling
SD_threads.Threads(Links, Article_data, Author_data)

# Merge the temporary JSON files
SD_save.Transf()

# # ========== Delete all temporary files (optional; back them up first) ==========
# SD_save.delete('./SpringerOpen_buffer/Article_TS/')
# SD_save.delete('./SpringerOpen_buffer/Author_TS/')
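
# ---------------------------------------------------------------------------
# For reference only: SD_link and SD_threads are this project's own modules
# and their sources are not shown here. Based on how they are called above
# (header() returning request headers, Link(url, headers) returning a parsed
# document with .find/.find_all, Threads(links, ...) fanning work out over a
# pool), minimal sketches could look like the functions below, assuming
# requests and BeautifulSoup. These are hypothetical reconstructions, not
# the actual module implementations, and they are never called by the
# pipeline above.

import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor


def sketch_header():
    """Assumed shape of SD_link.header: return a minimal header dict."""
    return {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    }


def sketch_link(url, headers):
    """Assumed shape of SD_link.Link: fetch a page and return it as a
    BeautifulSoup document so callers can use .find / .find_all."""
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def sketch_threads(links, article_data, author_data, workers=8):
    """Assumed shape of SD_threads.Threads: crawl every list page
    concurrently. The real module calls SD_scrawl for each link;
    process_page below is only a stand-in for that step."""
    def process_page(link):
        # Stand-in for SD_scrawl: fetch the list page, collect the
        # detail-page links, and append the parsed results to
        # article_data / author_data.
        pass

    with ThreadPoolExecutor(max_workers=workers) as pool:
        pool.map(process_page, links)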