diff --git a/WorldScientific b/WorldScientific
new file mode 100644
index 0000000..3b268f2
--- /dev/null
+++ b/WorldScientific
@@ -0,0 +1,179 @@
+import json
+import re
+import uuid
+
+import requests
+from bs4 import BeautifulSoup, Tag
+
+
+# Send an HTTP request to fetch the page content
+url = 'https://www.worldscientific.com/doi/10.1142/S0219891623500017'
+response = requests.get(url)
+html_content = response.content
+
+# Parse the page with BeautifulSoup
+soup = BeautifulSoup(html_content, 'html.parser')
+
+# Extract and print the title
+title = soup.find(class_="citation__title").text
+print(title)
+
+
+# Extract every element whose class is "author-type"
+author_type_tags = soup.find_all(class_="author-type")
+
+# Collect the extracted text
+author_type_list = [tag.text.strip() for tag in author_type_tags]
+print(author_type_list)
+
+# Extract the <p> tag that follows each author-type element, collecting
+# the affiliations so they can be paired with the authors below
+affiliations = []
+for author_type_tag in author_type_tags:
+    affiliation = author_type_tag.find_next_sibling('p')
+    if affiliation:
+        content_affiliation = affiliation.get_text()
+        affiliations.append(content_affiliation)
+        print(content_affiliation)
+    else:
+        print("No <p> tag found after class=\"author-type\"")
+
+
+# Extract the author names and build the author dictionary
+author_tags = soup.find(class_="rlist--inline loa mobile-authors visible-xs").text
+
+# Split on commas and on " and " (the required surrounding whitespace keeps
+# names such as "Alexander" intact), then drop empty pieces and spaces
+author_list = [name.strip() for name in re.split(r',\s*|\s+and\s+', author_tags) if name.strip()]
+
+authors_dict = {}
+for i, author in enumerate(author_list, 1):
+    authors_dict[f"author_{i}"] = author
+    # Pair each author with the affiliation collected above, if there is one
+    authors_dict[f"affiliation_{i}"] = affiliations[i - 1] if i - 1 < len(affiliations) else ""
+
+print(authors_dict)
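+
+# Quick sanity check for the author split above, on an illustrative
+# sample string (hypothetical names, not taken from the page):
+assert re.split(r',\s*|\s+and\s+', "Jane Doe, John Roe and Alexander Poe") == \
+    ["Jane Doe", "John Roe", "Alexander Poe"]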
+
+# Extract the keywords: find the div with id "keywords"
+keywords_div = soup.find('div', id='keywords')
+
+keywords = []
+if keywords_div:
+    keyword_items = keywords_div.find_all('a')
+    keywords = [item.text for item in keyword_items]
+
+print(keywords)
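+
+# Hedged aside, not part of the original script: soup.find() returns None
+# when an element is missing, so the bare .text accesses in this file can
+# raise AttributeError on pages with a different layout. A small helper
+# like this one (illustrative name and default) makes that failure soft,
+# e.g. find_text(soup, class_="citation__title"):
+def find_text(parent, default="", **kwargs):
+    node = parent.find(**kwargs)
+    return node.get_text(strip=True) if node else default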
+
+
+# Regex matching the div that carries the AMSC codes
+pattern = re.compile(r'^<div>AMSC: .*</div>$')
+
+# Find the div elements matching the regex
+divs = soup.find_all(lambda tag: tag.name == 'div' and re.match(pattern, str(tag)))
+
+# Extract the AMSC values from each matching div
+amsc_values = []
+for div in divs:
+    amsc_text = div.text
+    # Drop the "AMSC: " prefix and split the comma-separated values
+    amsc_values = amsc_text.replace('AMSC: ', '').strip().split(', ')
+    print("AMSC", amsc_values)
+
+
+# Find the received (submission) date
+pattern = re.compile('^received', re.IGNORECASE)
+received_div = soup.find('div', string=pattern)
+
+received_text = ""
+if received_div:
+    received_text = received_div.text
+print(received_text)
+
+
+# Find the publication date
+pattern1 = re.compile('^Published', re.IGNORECASE)
+Published_div = soup.find('div', string=pattern1)
+
+Published_text = ""
+if Published_div:
+    Published_text = Published_div.text
+print(Published_text)
+
+# Find the DOI link and extract the DOI
+doi_element = soup.find("span", class_="epub-section__item").find("a")
+doi = doi_element['href'].split("doi.org")[-1]
+print("DOI:", doi)
+
+
+# Find the links with class "article__tocHeading" that carry the
+# volume/issue/page information
+links = soup.find_all('a', class_='article__tocHeading')
+
+vol_number = issue_number = page_range = publication_year = ""
+for link in links:
+    text = link.text  # link text content
+
+    # Extract volume, issue, pages and year with a regex
+    match = re.search(r'Vol\. (\d+), No\. (\d+), pp\. (\d+-\d+) \((\d+)\)', text)
+
+    if match:
+        vol_number = match.group(1)
+        issue_number = match.group(2)
+        page_range = match.group(3)
+        publication_year = match.group(4)
+
+        print("Volume Number:", vol_number)
+        print("Issue Number:", issue_number)
+        print("Page Range:", page_range)
+        print("Publication Year:", publication_year)
+    else:
+        print("No match found.")
+
+data = {
+    "article_id": str(uuid.uuid4()),
+    "Author": authors_dict,
+    "Title": title,
+    "received time": received_text,
+    "publish time": Published_text,
+    "keywords": keywords,
+    "Volume": vol_number,
+    "issue": issue_number,
+    "page": page_range,
+    "DOI": doi,
+    "url": url,
+    "journal": "worldscientific",
+    "MSC": amsc_values,
+}
+
+
+class CustomJSONEncoder(json.JSONEncoder):
+    # Safety net: serialize any stray bs4 Tag as its HTML string
+    def default(self, obj):
+        if isinstance(obj, Tag):
+            return str(obj)
+        return super().default(obj)
+
+
+# Serialize with the custom JSON encoder
+with open("data.json", "w", encoding="utf-8") as f:
+    json.dump(data, f, indent=2, ensure_ascii=False, cls=CustomJSONEncoder)
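+
+# Hedged round-trip check (illustrative, not in the original script): the
+# dump above should load back as plain JSON with the expected fields
+with open("data.json", encoding="utf-8") as f:
+    reloaded = json.load(f)
+print(reloaded["DOI"], reloaded["Title"])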