import json
import re
import uuid

import requests
from bs4 import BeautifulSoup, Tag
# Send the HTTP request and fetch the page (a browser-like User-Agent is
# assumed to be needed, since publisher sites often block the default one)
url = 'https://www.worldscientific.com/doi/10.1142/S0219891623500017'
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(url, headers=headers, timeout=30)
html_content = response.content
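# Abort early on an error response; all of the parsing below assumes the
# full article page was actually returned
response.raise_for_status()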
# Parse the page with BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')
# Extract and print the article title
title = soup.find(class_="citation__title").text
print(title)
# Extract every element with class "author-type"
author_type_tags = soup.find_all(class_="author-type")
author_type_list = [tag.text.strip() for tag in author_type_tags]
print(author_type_list)
# Each author-type tag is followed by a <p> holding the affiliation;
# collect them so each author can be paired with one below
affiliations = []
for author_type_tag in author_type_tags:
    affiliation = author_type_tag.find_next_sibling('p')
    if affiliation:
        affiliations.append(affiliation.get_text().strip())
        print(affiliations[-1])
    else:
        print('No <p> tag found after class="author-type"')
# Extract the author names and build the author dictionary
author_tags = soup.find(class_="rlist--inline loa mobile-authors visible-xs").text
# Split on commas and the word "and", then strip the surrounding
# whitespace (deleting all spaces up front would mangle the names)
author_list = [a.strip() for a in re.split(r',|\band\b', author_tags) if a.strip()]
authors_dict = {}
for i, author in enumerate(author_list, 1):
    authors_dict[f"author_{i}"] = author
    # Pair each author with the affiliation collected above, when available
    authors_dict[f"affiliation_{i}"] = affiliations[i - 1] if i - 1 < len(affiliations) else None
print(authors_dict)
# Extract the keywords from the div with id "keywords"
keywords_div = soup.find('div', id='keywords')
keywords = []
if keywords_div:
    keyword_items = keywords_div.find_all('a')
    keywords = [item.text for item in keyword_items]
print(keywords)
# Regular expression matching the div that carries the AMSC codes
pattern = re.compile(r'^<div><b>AMSC: </b>.*</div>$')
divs = soup.find_all(lambda tag: tag.name == 'div' and re.match(pattern, str(tag)))
# Extract the AMSC values from each matching div, dropping the "AMSC: " prefix
amsc_values = []
for div in divs:
    amsc_text = div.text
    amsc_values = amsc_text.replace('AMSC: ', '').strip().split(', ')
    print("AMSC", amsc_values)
# Find the submission date: match a div whose text starts with "received"
pattern = re.compile('^received', re.IGNORECASE)
received_div = soup.find('div', string=pattern)
received_text = ""
if received_div:
    received_text = received_div.text
    print(received_text)
# Find the publication date: match a div whose text starts with "Published"
pattern1 = re.compile('^Published', re.IGNORECASE)
Published_div = soup.find('div', string=pattern1)
Published_text = ""
if Published_div:
    Published_text = Published_div.text
    print(Published_text)
# Find the DOI link and extract the DOI, dropping the "https://doi.org/" part
doi_element = soup.find("span", class_="epub-section__item").find("a")
doi = doi_element['href'].split("doi.org")[-1].lstrip("/")
print("DOI:", doi)
# Find the links with class "article__tocHeading" that carry volume/issue info
links = soup.find_all('a', class_='article__tocHeading')
vol_number = issue_number = page_range = publication_year = None
for link in links:
    text = link.text
    # Pull volume, issue, page range and year out of text shaped like
    # "Vol. 24, No. 1, pp. 1-54 (2023)"
    match = re.search(r'Vol\. (\d+), No\. (\d+), pp\. (\d+-\d+) \((\d+)\)', text)
    if match:
        vol_number = match.group(1)
        issue_number = match.group(2)
        page_range = match.group(3)
        publication_year = match.group(4)
        print("Volume Number:", vol_number)
        print("Issue Number:", issue_number)
        print("Page Range:", page_range)
        print("Publication Year:", publication_year)
    else:
        print("No match found.")
# Assemble the record to serialize
data = {
    "article_id": str(uuid.uuid4()),
    "Author": authors_dict,
    "Title": title,
    "received time": received_text,
    "publish time": Published_text,
    "keywords": keywords,
    "Volume": vol_number,
    "issue": issue_number,
    "page": page_range,
    "DOI": doi,
    "url": url,
    "journal": "worldscientific",
    "MSC": amsc_values,
}
class CustomJSONEncoder(json.JSONEncoder):
    # Fall back to str() for BeautifulSoup Tag objects, which the stock
    # JSON encoder cannot serialize
    def default(self, obj):
        if isinstance(obj, Tag):
            return str(obj)
        return super().default(obj)

# Serialize with the custom encoder, keeping non-ASCII text readable
with open("data.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False, cls=CustomJSONEncoder)
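# data.json ends up shaped roughly like this (values are illustrative
# placeholders, not real output):
# {
#   "article_id": "<uuid4>",
#   "Author": {"author_1": "...", "affiliation_1": "...", ...},
#   "Title": "...",
#   "received time": "...",
#   "publish time": "...",
#   "keywords": ["...", ...],
#   "Volume": "...", "issue": "...", "page": "...",
#   "DOI": "10.1142/...",
#   "url": "https://www.worldscientific.com/doi/10.1142/S0219891623500017",
#   "journal": "worldscientific",
#   "MSC": ["...", ...]
# }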