10 years articles
This commit is contained in:
parent b2c845dc6e
commit 20cf71530a
174
WorldScientific
Normal file
@@ -0,0 +1,174 @@
import requests
import re
import uuid
import json
from bs4 import BeautifulSoup, Tag


# Send an HTTP request to fetch the page
url = 'https://www.worldscientific.com/doi/10.1142/S0219891623500017'
response = requests.get(url)
html_content = response.content
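# Publisher sites often reject the default requests User-Agent, so a bare
# GET may return an error page. A minimal hardening sketch (header value
# and timeout are illustrative assumptions, not taken from this script):
# response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
# response.raise_for_status()  # fail fast on 4xx/5xx instead of parsing an error page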

# Parse the page content with BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Extract the title
title = soup.find(class_="citation__title").text
# Print the title
print(title)

# Extract all elements whose class is "author-type"
author_type_tags = soup.find_all(class_="author-type")

# Collect the extracted text
author_type_list = [tag.text.strip() for tag in author_type_tags]

# Print the extracted content
print(author_type_list)

# Collect each block's affiliation text so it can be paired with an author below
affiliations = []
for author_type_tag in author_type_tags:
    # Extract the <p> tag that follows
    affiliation = author_type_tag.find_next_sibling('p')
    if affiliation:
        content_affiliation = affiliation.get_text()
        affiliations.append(content_affiliation)
        print(content_affiliation)
    else:
        print("No <p> tag found after class=\"author-type\"")


# Extract author names and build the author dictionary
author_tags = soup.find(class_="rlist--inline loa mobile-authors visible-xs").text

# Split the byline on commas and on the word " and "; splitting on a bare
# "and" after deleting all spaces would corrupt names that merely contain
# it, e.g. "Alexander"
author_list = [a.strip() for a in re.split(r',\s*|\s+and\s+', author_tags.strip()) if a.strip()]

authors_dict = {}
for i, author in enumerate(author_list, 1):
    # Trim surrounding whitespace
    author = author.strip()
    authors_dict[f"author_{i}"] = author
    # Pair each author with the affiliation collected above; this assumes
    # the page lists authors and affiliations in the same order and count
    authors_dict[f"affiliation_{i}"] = affiliations[i - 1] if i <= len(affiliations) else ""

print(authors_dict)
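# Sanity check of the splitting pattern on a made-up byline
# (hypothetical string, not taken from the page):
#   re.split(r',\s*|\s+and\s+', "Alice Smith, Bob Jones and Carol Andrews")
#   -> ['Alice Smith', 'Bob Jones', 'Carol Andrews']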

# Extract keywords: find the div element whose id is "keywords"
keywords_div = soup.find('div', id='keywords')

# Pull out the keyword links
keywords = []
if keywords_div:
    keyword_items = keywords_div.find_all('a')
    keywords = [item.text for item in keyword_items]

print(keywords)

# Regular expression matching the AMSC div to extract
pattern = re.compile(r'^<div><b>AMSC: </b>.*</div>$')

# Find the div elements whose serialized HTML matches the pattern
divs = soup.find_all(lambda tag: tag.name == 'div' and re.match(pattern, str(tag)))

# Extract the AMSC codes from each matching div
amsc_values = []  # default so the JSON build below cannot hit a NameError
for div in divs:
    amsc_text = div.text
    # Drop the "AMSC: " prefix and split into individual codes
    amsc_values = amsc_text.replace('AMSC: ', '').strip().split(', ')

print("AMSC", amsc_values)

# Find the submission date: match divs whose text starts with "received"
pattern = re.compile('^received', re.IGNORECASE)

# Look up the matching div element
received_div = soup.find('div', string=pattern)

# Extract the content
received_text = None
if received_div:
    received_text = received_div.text

print(received_text)
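# Caveat: string= only matches a tag whose entire content is a single
# string, so a date wrapped in nested markup would be missed. A more
# tolerant variant (a sketch, not verified against this page's markup):
# received_div = soup.find(
#     lambda t: t.name == 'div' and t.get_text(strip=True).lower().startswith('received')
# )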

# Find the publication date: match divs whose text starts with "Published"
pattern1 = re.compile('^Published', re.IGNORECASE)

# Look up the matching div element
Published_div = soup.find('div', string=pattern1)

# Extract the content
Published_text = None
if Published_div:
    Published_text = Published_div.text

print(Published_text)

# Find the DOI link and extract the DOI
doi_element = soup.find("span", class_="epub-section__item").find("a")
doi = doi_element['href'].split("doi.org")[-1]

print("DOI:", doi)
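# Note: split("doi.org")[-1] keeps the leading slash, e.g. "/10.1142/...".
# If a bare DOI is wanted instead, strip it:
# doi = doi_element['href'].split("doi.org")[-1].lstrip('/')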

# Find the links whose class is article__tocHeading
links = soup.find_all('a', class_='article__tocHeading')

# Walk the links found
vol_number = issue_number = page_range = publication_year = None
for link in links:
    text = link.text  # text content of the link

    # Extract the data with a regular expression
    match = re.search(r'Vol\. (\d+), No\. (\d+), pp\. (\d+-\d+) \((\d+)\)', text)

    if match:
        vol_number = match.group(1)
        issue_number = match.group(2)
        page_range = match.group(3)
        publication_year = match.group(4)

        # Print the extracted data
        print("Volume Number:", vol_number)
        print("Issue Number:", issue_number)
        print("Page Range:", page_range)
        print("Publication Year:", publication_year)
    else:
        print("No match found.")
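# Example of the citation format this regex expects (made-up text):
#   re.search(r'Vol\. (\d+), No\. (\d+), pp\. (\d+-\d+) \((\d+)\)',
#             'Vol. 20, No. 1, pp. 1-30 (2023)').groups()
#   -> ('20', '1', '1-30', '2023')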

data = {
    "article_id": str(uuid.uuid4()),
    "Author": authors_dict,
    "Title": title,
    "received time": received_text,
    "publish time": Published_text,
    "keywords": keywords,
    "Volume": vol_number,
    "issue": issue_number,
    "page": page_range,
    "DOI": doi,
    "url": url,
    "journal": "worldscientific",
    "MSC": amsc_values,
}


class CustomJSONEncoder(json.JSONEncoder):
    """JSON encoder that serializes stray BeautifulSoup Tag objects as plain strings."""

    def default(self, obj):
        if isinstance(obj, Tag):
            return str(obj)
        return super().default(obj)

# Serialize with the custom JSON encoder
with open("data.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False, cls=CustomJSONEncoder)
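# Quick check of the encoder on a bare Tag value:
#   json.dumps({"t": BeautifulSoup("<p>x</p>", "html.parser").p}, cls=CustomJSONEncoder)
#   -> '{"t": "<p>x</p>"}'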