import requests
from bs4 import BeautifulSoup, Tag
import re
import uuid
import json
# Send an HTTP request to fetch the page content
url = 'https://www.worldscientific.com/doi/10.1142/S0219891623500017'
response = requests.get(url)
html_content = response.content
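# Defensive check (a minimal addition): fail fast on HTTP errors such as 403
# instead of parsing an error page. Note: publisher sites like
# worldscientific.com may block requests that lack a browser-like User-Agent;
# if so, passing headers={"User-Agent": "Mozilla/5.0"} to requests.get is one
# option (an assumption about the site's behavior, not verified here).
response.raise_for_status()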
# Parse the page content with BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')
# Extract the title
title = soup.find(class_="citation__title").text
# Print the title
print(title)
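# Note: soup.find(...) returns None when a selector misses (e.g. after a page
# redesign), so chaining .text raises AttributeError. A safer variant is a
# sketch like:
#     title_tag = soup.find(class_="citation__title")
#     title = title_tag.text if title_tag else ""
# The code below keeps the original assumption that each node is present.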
# Extract every element whose class is "author-type"
author_type_tags = soup.find_all(class_="author-type")
# Collect the text of each match
author_type_list = [tag.text.strip() for tag in author_type_tags]
# Print the extracted text
print(author_type_list)
# For each author-type tag, extract the following <p> tag (the affiliation)
affiliations = []
for author_type_tag in author_type_tags:
    affiliation = author_type_tag.find_next_sibling('p')
    if affiliation:
        content_affiliation = affiliation.get_text()
        affiliations.append(content_affiliation)
        print(content_affiliation)
    else:
        print('No <p> tag found after class="author-type"')
# Extract the author names and build the author dictionary
author_tags = soup.find(class_="rlist--inline loa mobile-authors visible-xs").text
# Split the author string on commas and the word "and" (splitting before
# stripping spaces keeps names intact and avoids splitting inside words
# such as "Alexander")
author_list = [a.strip() for a in re.split(r',|\band\b', author_tags) if a.strip()]
authors_dict = {}
for i, author in enumerate(author_list, 1):
    authors_dict[f"author_{i}"] = author
    # Pair each author with the affiliation collected above (empty if missing)
    authors_dict[f"affiliation_{i}"] = affiliations[i - 1] if i - 1 < len(affiliations) else ""
print(authors_dict)
# Extract the keywords: find the div element with id "keywords"
keywords_div = soup.find('div', id='keywords')
keywords = []
if keywords_div:
    keyword_items = keywords_div.find_all('a')
    keywords = [item.text for item in keyword_items]
print(keywords)
# Regex for the div that holds the AMS subject classification; the page
# serializes it as "<div>AMSC: ...</div>"
pattern = re.compile(r'^<div>AMSC: .*</div>$')
amsc_values = []
# Find the div elements matching the regex
divs = soup.find_all(lambda tag: tag.name == 'div' and re.match(pattern, str(tag)))
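# Alternative sketch (assuming the AMSC line is the div's direct string
# content): matching on the text avoids depending on the exact HTML
# serialization of the tag:
#     divs = soup.find_all('div', string=re.compile(r'^AMSC: '))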
# Extract the AMSC content from each matching div
for div in divs:
    amsc_text = div.text
    # Extract the AMSC values (dropping the "AMSC: " prefix)
    amsc_values = amsc_text.replace('AMSC: ', '').strip().split(', ')
    print("AMSC", amsc_values)
# Find the submission date: match div elements whose text starts with "Received"
pattern = re.compile('^received', re.IGNORECASE)
received_div = soup.find('div', string=pattern)
# Extract the content (default to "" so later code never sees an undefined name)
received_text = ""
if received_div:
    received_text = received_div.text
    print(received_text)
# Find the publication date: match div elements whose text starts with "Published"
pattern1 = re.compile('^Published', re.IGNORECASE)
Published_div = soup.find('div', string=pattern1)
Published_text = ""
if Published_div:
    Published_text = Published_div.text
    print(Published_text)
# Find the DOI link and extract the DOI path
doi_element = soup.find("span", class_="epub-section__item").find("a")
doi = doi_element['href'].split("doi.org")[-1]
print("DOI:", doi)
# Find the links with class "article__tocHeading"
links = soup.find_all('a', class_='article__tocHeading')
vol_number = issue_number = page_range = publication_year = ""
# Iterate over the links found
for link in links:
    text = link.text  # the link's text content
    # Extract volume, issue, page range and year with a regex
    match = re.search(r'Vol\. (\d+), No\. (\d+), pp\. (\d+-\d+) \((\d+)\)', text)
    if match:
        vol_number = match.group(1)
        issue_number = match.group(2)
        page_range = match.group(3)
        publication_year = match.group(4)
        # Print the extracted data
        print("Volume Number:", vol_number)
        print("Issue Number:", issue_number)
        print("Page Range:", page_range)
        print("Publication Year:", publication_year)
    else:
        print("No match found.")
data = {
    "article_id": str(uuid.uuid4()),
    "Author": authors_dict,
    "Title": title,
    "received time": received_text,
    "publish time": Published_text,
    "keywords": keywords,
    "Volume": vol_number,
    "issue": issue_number,
    "page": page_range,
    "DOI": doi,
    "url": url,
    "journal": "worldscientific",
    "MSC": amsc_values,
}
class CustomJSONEncoder(json.JSONEncoder):
    """Serialize any stray BeautifulSoup Tag objects as their HTML string."""
    def default(self, obj):
        if isinstance(obj, Tag):
            return str(obj)
        return super().default(obj)
# Serialize to data.json using the custom JSON encoder
with open("data.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False, cls=CustomJSONEncoder)
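# Optional sanity check: reload the file to confirm the data round-trips
# through JSON cleanly (assumes the title was extracted above).
with open("data.json", encoding="utf-8") as f:
    print(json.load(f)["Title"])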