Optimization:
1. Add special-issue papers
2. Optimize the DOI regular expression

Bug fix:
1. Handle multiple emails per author

Reminder:
1. The author's email data structure changed because of the multiple-email fix ("email" is now a list)
parent 88bcbf5b8f
commit ba3671b5fd
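Note on the reminder above: with this commit, each affiliation entry stores a list of emails rather than a single string. A rough before/after sketch of the saved author record (field values here are invented; unrelated fields are omitted):

```python
# Before: one email string (or None) per affiliation entry.
author_before = {
    "affiliation": [{
        "year": "2023",
        "affiliation": "Example University",   # invented value
        "email": "jane.doe@example.edu"        # single string
    }]
}

# After: every address found in the author cell, kept as a list.
author_after = {
    "affiliation": [{
        "year": "2023",
        "affiliation": "Example University",
        "email": ["jane.doe@example.edu", "j.doe@alt.example.org"]  # list of strings
    }]
}
```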
@@ -12,8 +12,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 '''
 Crawled site: 'ejde.math.txstate.edu'
 
-Total number of papers: 2023/08/08 - 4300
-Total Time via VPN w/100ms-delay: 254.04s
+Total number of papers: 2023/08/08 - 4339
+Total Time via VPN w/52ms-delay: 430.38s
 
 ========== Run order ==========
 1. ejde_main gets the journal links for each year -> scrapes each paper's info and author info -> calls ejde_save -> stores the results temporarily in small JSON files
@@ -68,15 +68,17 @@ def process_volume(url):
     baseWeb = url[:url.rfind('/')] + "/"
     html = volume_response.text
     volume_soup = BeautifulSoup(html, "html.parser")
-    ol_elements = volume_soup.find_all('ol')
+    li_elements = volume_soup.find_all('ol')
+    if not li_elements:
+        li_elements = volume_soup.find_all('ul')
 
-    for ol in ol_elements:
-        em_elements = ol.find_all('em')
+    for li in li_elements:
+        em_elements = li.find_all('em')
         if em_elements:
             articles.extend(em for em in em_elements)
         # Another html style
         else:
-            i_elements = ol.find_all('i')
+            i_elements = li.find_all('i')
             if i_elements:
                 articles.extend(i for i in i_elements)
             else:
@@ -157,21 +159,26 @@ def process_article(title, article_url):
     title_match = re.search(r"<h3>(.*?)<p>", article_text)
     title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else None
 
+    # Extract issue
+    issue_match = re.search(r'No\. (\d+)', article_text)
+    issue = issue_match.group(1) if issue_match else None
+
     # Extract volume
-    volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
-    volume = str(volume_match.group(1)) if volume_match else None
-    if not volume:
-        volume_match = re.search(r'Vol\. (\d+)', article_text)
-        volume = str(volume_match.group(1)) if volume_match else None
+    volume_match = re.search(r'Vol\. (\d+)', article_text)
+    volume = str(volume_match.group(1)) if volume_match else None
+    if not volume:
+        volume_match = re.search(r'Special Issue (\d+) \((\d+)\)', article_text)
+        if volume_match:
+            issue_number, volume = volume_match.groups()
+            volume = str(volume)
+            issue = "Special Issue " + str(issue_number)
+        else:
+            volume = None
 
     # Extract pp
     pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
     pp = pp_match.group(1) if pp_match else None
 
-    # Extract issue
-    issue_match = re.search(r'No\. (\d+)', article_text)
-    issue = issue_match.group(1) if issue_match else None
-
     # Extract submission date
     match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
     submitted_date = match.group(1) if match else None
@@ -207,7 +214,7 @@ def process_article(title, article_url):
     keywords = []
 
     # Extract DOI
-    doi_match = re.search(r'DOI: (.+)(?=<)', html)
+    doi_match = re.search(r'DOI: ([^\t\n<]+)', html)
     if not doi_match:
         doi_match = re.search(r'DOI: (.+)', html)
     doi = doi_match.group(1) if doi_match else None
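Why the new DOI pattern is tighter: the old greedy `(.+)` with a lookahead backtracks only to the last `<` on the line, so it can drag trailing markup into the captured DOI, while `([^\t\n<]+)` stops at the first `<`, tab, or newline. A small sketch (the HTML fragment and DOI below are made up for illustration):

```python
import re

# Hypothetical fragment of an article page; real EJDE markup may differ.
html = "<p>DOI: 10.1234/ejde.2023.50</p> <a href='abstract.html'>abstract</a>"

old = re.search(r'DOI: (.+)(?=<)', html)
new = re.search(r'DOI: ([^\t\n<]+)', html)

print(old.group(1))  # 10.1234/ejde.2023.50</p> <a href='abstract.html'>abstract
print(new.group(1))  # 10.1234/ejde.2023.50
```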
@@ -222,18 +229,33 @@ def process_article(title, article_url):
     for row in table.find_all('tr'):
         cells = [cell.text.strip() for cell in row.find_all('td')]
         for cell in cells:
+            if "email:" in cell:
+                cell = cell.split("email:")
+                email_list = str(cell[1]).split(',')
+                cell = cell[0]
+            elif "e-mail:" in cell:
+                cell = cell.split("e-mail:")
+                email_list = str(cell[1]).split(',')
+                cell = cell[0]
+            else:
+                email_list = None
+
             cell = re.split(r'[\r\n]+', cell)
-            cell = [c.replace('email: ', '').replace('\\newline', '') for c in cell]
+            cell = [c.replace('\\newline', '') for c in cell]
             cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]
 
             # Data processing
             if cell[0]:
                 authors.append(cell[0])
                 name = re.split(r'[ .]', cell[0])
-                affiliation = ', '.join(cell[1:-1])
-                affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
-                email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1])
-                email = email_match.group() if email_match else None
+                affiliation = ', '.join(cell[1:]).lstrip(",").rstrip(",").strip()
+                affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
+                affiliation = affiliation.lstrip(",").rstrip(",").strip()
+                emails = []
+                if email_list:
+                    for email in email_list:
+                        email_match = re.search(r'[\w.-]+@[\w.-]+', email)
+                        emails.append(email_match.group()) if email_match else None
 
                 author_data = {
                     "author_id": str(uuid.uuid4()),
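To see what the new per-cell email handling produces, here is a minimal, self-contained sketch; the sample cell text is invented and the splitting is simplified slightly compared with the loop above, but the split-on-"email:"-then-regex idea is the same:

```python
import re

# Invented author cell from the affiliation table; real pages vary.
cell = "Jane Doe\nDepartment of Mathematics, Example University\nemail: jane.doe@example.edu, j.doe@alt.example.org"

email_list = None
if "email:" in cell:
    cell, tail = cell.split("email:", 1)   # keep the part before the marker as the cell
    email_list = tail.split(',')           # one entry per comma-separated address

emails = []
if email_list:
    for item in email_list:
        m = re.search(r'[\w.-]+@[\w.-]+', item)
        if m:
            emails.append(m.group())

print(emails)  # ['jane.doe@example.edu', 'j.doe@alt.example.org']
```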
@@ -244,7 +266,7 @@ def process_article(title, article_url):
                     "affiliation": [{
                         "year": volume,
                         "affiliation": affiliation,
-                        "email": email
+                        "email": emails
                     }]
                 }
                 authorData.append(author_data)
@@ -267,23 +289,38 @@ def process_article(title, article_url):
     matches = matches.replace('["', '').replace('"]', '').replace('[\'', '').replace('\']', '')
     matches = matches.split("<p>")
     for match in matches:
+        if "email:" in match:
+            match = match.split("email:")
+            email_list = str(match[1]).split(',')
+            match = match[0]
+        elif "e-mail:" in match:
+            match = match.split("e-mail:")
+            email_list = str(match[1]).split(',')
+            match = match[0]
+        else:
+            email_list = None
+
         match = re.sub(r'<[^>]+>', '', match)
         match = match.lstrip("\\n ").lstrip("\n ").rstrip("\\n ").rstrip("\n ").strip()
         if match_type == 0:
             match = match.split("\\n")
         else:
             match = match.split("\n")
-        match = [m.replace('email: ', '').replace('\\newline', '') for m in match]
+        match = [m.replace('\\newline', '') for m in match]
        match = [re.sub(r'\s+', ' ', m).strip() for m in match]
 
         # Data processing
         if match[0]:
             authors.append(match[0])
             name = re.split(r'[ .]', match[0])
-            affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip()
-            affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
-            email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1])
-            email = email_match.group() if email_match else None
+            affiliation = ''.join(match[1:]).lstrip(",").rstrip(",").strip()
+            affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
+            affiliation = affiliation.lstrip(",").rstrip(",").strip()
+            emails = []
+            if email_list:
+                for email in email_list:
+                    email_match = re.search(r'[\w.-]+@[\w.-]+', email)
+                    emails.append(email_match.group()) if email_match else None
 
             author_data = {
                 "author_id": str(uuid.uuid4()),
@@ -294,7 +331,7 @@ def process_article(title, article_url):
                 "affiliation": [{
                     "year": volume,
                     "affiliation": affiliation,
-                    "email": email
+                    "email": emails
                 }]
             }
             authorData.append(author_data)
@@ -339,14 +376,21 @@ def process_article(title, article_url):
 
 
 start_time = time.time()
+url_list = []
 
+# Get all general volumes url
 index = "https://ejde.math.txstate.edu/indexleft.html"
 response = requests.get(index)
 soup = BeautifulSoup(response.content, 'html.parser')
 
-# Find all the URL links under the first Volume section
 volume_links = soup.select('font > a[href]')
-# Extract and store the URLs in a list using list comprehension
-url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1]
+url_list.extend(["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1])
+# Get all special issues url
+index = "https://ejde.math.txstate.edu/special-toc.html"
+response = requests.get(index)
+soup = BeautifulSoup(response.content, 'html.parser')
+special_links = soup.find_all("a", href=True)
+url_list.extend(["https://ejde.math.txstate.edu/" + tag["href"] for tag in special_links[:-1]])
+
 # Initialize lists
 authorData = []
@@ -361,7 +405,7 @@ authorNum = 0
 articleNum = 0
 
 batch_size = 100 # Number of articles to process before saving
-executor = ThreadPoolExecutor(max_workers=25) # Set the number of worker threads
+executor = ThreadPoolExecutor(max_workers=len(url_list)) # Set the number of worker threads
 
 # Process each URL using multithreading
 futures = [executor.submit(process_volume, url) for url in url_list]