import time
import uuid
from urllib.parse import urljoin

import SD_link
import SD_detail
import SD_save


# ========== Collect the links to each paper's detail page ==========
def Scrawl(Link, Article_data, Author_data):
    # Visit the paper list page
    headers = SD_link.header()
    soup = SD_link.Link(Link, headers)
    print(Link)

    # Gather the links to all paper detail pages
    Essay_Ol = soup.find('ol')           # the <ol> holding the paper list
    Essay_Li = Essay_Ol.find_all('li')   # one <li> per paper

    # Crawl every paper on this list page (50 papers per page)
    for Essay_hrefs in Essay_Li:
        Essay_href = Essay_hrefs.find('a', itemprop='url')
        if Essay_href is not None:
            time.sleep(0.1)  # brief pause between requests
            sub_Link = Essay_href.get('href')  # relative link to the detail page
            sub_Link = urljoin('https://advancesincontinuousanddiscretemodels.springeropen.com/', sub_Link)

            # ========== Visit the paper detail page ==========
            sub_soup = SD_link.Link(sub_Link, headers)  # parsed detail page
            article_id = str(uuid.uuid4())  # ID tying the article to its authors

            # Extract the details and append them to the corresponding lists
            Article_data.append(SD_detail.Article_dict(sub_soup, sub_Link, article_id))
            Author_data = SD_detail.Author_dict(sub_soup, article_id, Author_data)

    # Stage the results in small JSON buffer files
    if Article_data:
        index = str(uuid.uuid4())
        SD_save.save_data(Article_data, "Article_TS", index + ".json")
        print('Finished: ' + index + ' has been added to ./SpringerOpen_buffer/Article_TS/')
    if Author_data:
        index = str(uuid.uuid4())
        SD_save.save_data(Author_data, "Author_TS", index + ".json")
        print('Finished: ' + index + ' has been added to ./SpringerOpen_buffer/Author_TS/')
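
# --------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module).
# Assumptions, inferred from the call sites above and not confirmed by the
# source: SD_link.header() returns a dict of HTTP request headers,
# SD_link.Link(url, headers) fetches a page and returns a BeautifulSoup
# object, and the journal's article list is paginated via a hypothetical
# "page" query parameter on the /articles path.
if __name__ == '__main__':
    Article_data = []  # accumulates one dict per paper
    Author_data = []   # accumulates one dict per author

    # Crawl the first list page; Scrawl saves its own JSON buffer files,
    # so nothing needs to be collected from the return value.
    list_url = 'https://advancesincontinuousanddiscretemodels.springeropen.com/articles?page=1'
    Scrawl(list_url, Article_data, Author_data)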