import json
import re
import uuid

import requests
from bs4 import BeautifulSoup, Tag

# Send an HTTP request to fetch the page
url = 'https://www.worldscientific.com/doi/10.1142/S0219891623500017'
response = requests.get(url)
html_content = response.content

# Parse the page with BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Extract the title
title = soup.find(class_="citation__title").text
print(title)

# Extract every element with class "author-type"
author_type_tags = soup.find_all(class_="author-type")
author_type_list = [tag.text.strip() for tag in author_type_tags]
print(author_type_list)

# Extract the <p> tag that follows each author-type element (the affiliation)
affiliations = []
for author_type_tag in author_type_tags:
    affiliation = author_type_tag.find_next_sibling('p')
    if affiliation:
        content_affiliation = affiliation.get_text()
        affiliations.append(content_affiliation)
        print(content_affiliation)
    else:
        print('No <p> tag found after class="author-type"')

# Extract the author names and build the author dictionary
author_text = soup.find(class_="rlist--inline loa mobile-authors visible-xs").get_text(strip=True)
# Split the author string on commas and "and"; adjust if the page separates names differently
author_list = [name.strip() for name in re.split(r',|\band\b', author_text) if name.strip()]
authors_dict = {}
for i, author in enumerate(author_list, 1):
    authors_dict[f"author_{i}"] = author
    # Pair each author with the affiliation extracted above, if one was found
    authors_dict[f"affiliation_{i}"] = affiliations[i - 1] if i <= len(affiliations) else None
print(authors_dict)

# Extract keywords: find the div with id "keywords" and collect its links
keywords = []
keywords_div = soup.find('div', id='keywords')
if keywords_div:
    keyword_items = keywords_div.find_all('a')
    keywords = [item.text for item in keyword_items]
print(keywords)

# Regular expression matching the div that holds the AMSC codes
pattern = re.compile(r'^<div>AMSC: .*</div>$')
divs = soup.find_all(lambda tag: tag.name == 'div' and re.match(pattern, str(tag)))
# Extract the AMSC values from each matching div (dropping the "AMSC: " prefix)
amsc_values = []
for div in divs:
    amsc_text = div.text
    amsc_values = amsc_text.replace('AMSC: ', '').strip().split(', ')
    print("AMSC", amsc_values)

# Find the received date: match a div whose text starts with "Received"
received_text = ""
pattern = re.compile('^received', re.IGNORECASE)
received_div = soup.find('div', string=pattern)
if received_div:
    received_text = received_div.text
    print(received_text)

# Find the publication date: match a div whose text starts with "Published"
published_text = ""
pattern1 = re.compile('^Published', re.IGNORECASE)
published_div = soup.find('div', string=pattern1)
if published_div:
    published_text = published_div.text
    print(published_text)

# Find the DOI link and extract the DOI (the part after "doi.org/")
doi_element = soup.find("span", class_="epub-section__item").find("a")
doi = doi_element['href'].split("doi.org/")[-1]
print("DOI:", doi)

# Find the links with class "article__tocHeading" and parse volume, issue, pages, and year
vol_number = issue_number = page_range = publication_year = ""
links = soup.find_all('a', class_='article__tocHeading')
for link in links:
    text = link.text
    match = re.search(r'Vol\. (\d+), No\. (\d+), pp\. (\d+-\d+) \((\d+)\)', text)
    if match:
        vol_number = match.group(1)
        issue_number = match.group(2)
        page_range = match.group(3)
        publication_year = match.group(4)
        print("Volume Number:", vol_number)
        print("Issue Number:", issue_number)
        print("Page Range:", page_range)
        print("Publication Year:", publication_year)
    else:
        print("No match found.")

# Assemble the article record
data = {
    "article_id": str(uuid.uuid4()),
    "Author": authors_dict,
    "Title": title,
    "received time": received_text,
    "publish time": published_text,
    "keywords": keywords,
    "Volume": vol_number,
    "issue": issue_number,
    "page": page_range,
    "DOI": doi,
    "url": url,
    "journal": "worldscientific",
    "MSC": amsc_values,
}


class CustomJSONEncoder(json.JSONEncoder):
    """Fall back to str() for BeautifulSoup Tag objects so they can be serialized."""

    def default(self, obj):
        if isinstance(obj, Tag):
            return str(obj)
        return super().default(obj)


# Serialize the record with the custom JSON encoder
with open("data.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False, cls=CustomJSONEncoder)