10 years articles
This commit is contained in:
parent b2c845dc6e
commit 20cf71530a
174
WorldScientific
Normal file
@@ -0,0 +1,174 @@
import requests
import re
import uuid
import json
from bs4 import BeautifulSoup, Tag


# Send an HTTP request to fetch the page
url = 'https://www.worldscientific.com/doi/10.1142/S0219891623500017'
response = requests.get(url)
html_content = response.content
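# Publisher sites often reject the default requests User-Agent, so a bare
# GET may return an error page. A minimal hardening sketch (header value
# and timeout are illustrative assumptions, not taken from this script):
# response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
# response.raise_for_status()  # fail fast on 4xx/5xx instead of parsing an error page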

# Parse the page content with BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Extract the title
title = soup.find(class_="citation__title").text
# Print the title
print(title)

# Extract all elements whose class is "author-type"
author_type_tags = soup.find_all(class_="author-type")

# Collect the extracted text
author_type_list = [tag.text.strip() for tag in author_type_tags]

# Print the extracted content
print(author_type_list)

# Collect each block's affiliation text so it can be paired with an author below
affiliations = []
for author_type_tag in author_type_tags:
    # Extract the <p> tag that follows
    affiliation = author_type_tag.find_next_sibling('p')
    if affiliation:
        content_affiliation = affiliation.get_text()
        affiliations.append(content_affiliation)
        print(content_affiliation)
    else:
        print("No <p> tag found after class=\"author-type\"")


# Extract author names and build the author dictionary
author_tags = soup.find(class_="rlist--inline loa mobile-authors visible-xs").text

# Split the byline on commas and on the word " and "; splitting on a bare
# "and" after deleting all spaces would corrupt names that merely contain
# it, e.g. "Alexander"
author_list = [a.strip() for a in re.split(r',\s*|\s+and\s+', author_tags.strip()) if a.strip()]

authors_dict = {}
for i, author in enumerate(author_list, 1):
    # Trim surrounding whitespace
    author = author.strip()
    authors_dict[f"author_{i}"] = author
    # Pair each author with the affiliation collected above; this assumes
    # the page lists authors and affiliations in the same order and count
    authors_dict[f"affiliation_{i}"] = affiliations[i - 1] if i <= len(affiliations) else ""

print(authors_dict)
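# Sanity check of the splitting pattern on a made-up byline
# (hypothetical string, not taken from the page):
#   re.split(r',\s*|\s+and\s+', "Alice Smith, Bob Jones and Carol Andrews")
#   -> ['Alice Smith', 'Bob Jones', 'Carol Andrews']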

# Extract keywords: find the div element whose id is "keywords"
keywords_div = soup.find('div', id='keywords')

# Pull out the keyword links
keywords = []
if keywords_div:
    keyword_items = keywords_div.find_all('a')
    keywords = [item.text for item in keyword_items]

print(keywords)

# Regular expression matching the AMSC div to extract
pattern = re.compile(r'^<div><b>AMSC: </b>.*</div>$')

# Find the div elements whose serialized HTML matches the pattern
divs = soup.find_all(lambda tag: tag.name == 'div' and re.match(pattern, str(tag)))

# Extract the AMSC codes from each matching div
amsc_values = []  # default so the JSON build below cannot hit a NameError
for div in divs:
    amsc_text = div.text
    # Drop the "AMSC: " prefix and split into individual codes
    amsc_values = amsc_text.replace('AMSC: ', '').strip().split(', ')

print("AMSC", amsc_values)

# Find the submission date: match divs whose text starts with "received"
pattern = re.compile('^received', re.IGNORECASE)

# Look up the matching div element
received_div = soup.find('div', string=pattern)

# Extract the content
received_text = None
if received_div:
    received_text = received_div.text

print(received_text)
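# Caveat: string= only matches a tag whose entire content is a single
# string, so a date wrapped in nested markup would be missed. A more
# tolerant variant (a sketch, not verified against this page's markup):
# received_div = soup.find(
#     lambda t: t.name == 'div' and t.get_text(strip=True).lower().startswith('received')
# )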

# Find the publication date: match divs whose text starts with "Published"
pattern1 = re.compile('^Published', re.IGNORECASE)

# Look up the matching div element
Published_div = soup.find('div', string=pattern1)

# Extract the content
Published_text = None
if Published_div:
    Published_text = Published_div.text

print(Published_text)

# Find the DOI link and extract the DOI
doi_element = soup.find("span", class_="epub-section__item").find("a")
doi = doi_element['href'].split("doi.org")[-1]

print("DOI:", doi)
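# Note: split("doi.org")[-1] keeps the leading slash, e.g. "/10.1142/...".
# If a bare DOI is wanted instead, strip it:
# doi = doi_element['href'].split("doi.org")[-1].lstrip('/')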

# Find the links whose class is article__tocHeading
links = soup.find_all('a', class_='article__tocHeading')

# Walk the links found
vol_number = issue_number = page_range = publication_year = None
for link in links:
    text = link.text  # text content of the link

    # Extract the data with a regular expression
    match = re.search(r'Vol\. (\d+), No\. (\d+), pp\. (\d+-\d+) \((\d+)\)', text)

    if match:
        vol_number = match.group(1)
        issue_number = match.group(2)
        page_range = match.group(3)
        publication_year = match.group(4)

        # Print the extracted data
        print("Volume Number:", vol_number)
        print("Issue Number:", issue_number)
        print("Page Range:", page_range)
        print("Publication Year:", publication_year)
    else:
        print("No match found.")
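# Example of the citation format this regex expects (made-up text):
#   re.search(r'Vol\. (\d+), No\. (\d+), pp\. (\d+-\d+) \((\d+)\)',
#             'Vol. 20, No. 1, pp. 1-30 (2023)').groups()
#   -> ('20', '1', '1-30', '2023')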

data = {
    "article_id": str(uuid.uuid4()),
    "Author": authors_dict,
    "Title": title,
    "received time": received_text,
    "publish time": Published_text,
    "keywords": keywords,
    "Volume": vol_number,
    "issue": issue_number,
    "page": page_range,
    "DOI": doi,
    "url": url,
    "journal": "worldscientific",
    "MSC": amsc_values,
}


class CustomJSONEncoder(json.JSONEncoder):
    """JSON encoder that serializes stray BeautifulSoup Tag objects as plain strings."""

    def default(self, obj):
        if isinstance(obj, Tag):
            return str(obj)
        return super().default(obj)

# Serialize with the custom JSON encoder
with open("data.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False, cls=CustomJSONEncoder)
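# Quick check of the encoder on a bare Tag value:
#   json.dumps({"t": BeautifulSoup("<p>x</p>", "html.parser").p}, cls=CustomJSONEncoder)
#   -> '{"t": "<p>x</p>"}'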