Optimization:
1. Add special issue papers. 2. Optimize the DOI regular expression.
Bug fix:
1. Multiple-email problem.
Reminder:
1. The author email data structure has changed because of the multiple-email fix.
parent 88bcbf5b8f
commit ba3671b5fd
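Note on the reminder above: after this commit the per-affiliation "email" field holds a list of addresses rather than a single string. A minimal sketch of the new record shape, with made-up values (only the keys come from the diff below):

# Illustrative author record after this commit; the values are invented.
# The per-affiliation "email" field is now a list (possibly empty), not one string.
author_data = {
    "author_id": "generated-uuid",
    "affiliation": [{
        "year": "2023",
        "affiliation": "Example University",
        "email": ["a.author@example.edu", "b.author@example.edu"],
    }],
}

# Consumers that previously expected one string should iterate over the list.
for entry in author_data["affiliation"]:
    for address in entry["email"]:
        print(address)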
@@ -12,8 +12,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 '''
 Site crawled: 'ejde.math.txstate.edu'
 
-Total number of papers: 2023/08/08 - 4300
-Total Time via VPN w/100ms-delay: 254.04s
+Total number of papers: 2023/08/08 - 4339
+Total Time via VPN w/52ms-delay: 430.38s
 
 ========== Run order ==========
 1. ejde_main fetches the volume links for each year -> scrapes each paper's information and author information -> calls ejde_save -> stores the data temporarily in small JSON files
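For orientation, a minimal sketch of the buffering flow named in the run order, assuming the ejde_save helpers behave as their call sites later in this diff suggest; the record contents are placeholders.

import uuid

import ejde_save

articleData = [{"title": "placeholder article"}]
authorData = [{"author_id": str(uuid.uuid4())}]

# Each batch is written to a small temporary JSON file under ./ejde_buffer/ ...
ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")

# ... then merged into one large file, after which the temporary files are removed.
ejde_save.Transf()
ejde_save.delete()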
@@ -68,15 +68,17 @@ def process_volume(url):
     baseWeb = url[:url.rfind('/')] + "/"
     html = volume_response.text
     volume_soup = BeautifulSoup(html, "html.parser")
-    ol_elements = volume_soup.find_all('ol')
+    li_elements = volume_soup.find_all('ol')
+    if not li_elements:
+        li_elements = volume_soup.find_all('ul')
 
-    for ol in ol_elements:
-        em_elements = ol.find_all('em')
+    for li in li_elements:
+        em_elements = li.find_all('em')
         if em_elements:
             articles.extend(em for em in em_elements)
         # Another html style
         else:
-            i_elements = ol.find_all('i')
+            i_elements = li.find_all('i')
             if i_elements:
                 articles.extend(i for i in i_elements)
             else:
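A standalone sketch of the fallback added here: prefer ordered lists, and fall back to unordered lists when a volume page uses the other markup. The HTML fragment is invented for illustration.

from bs4 import BeautifulSoup

# Invented volume-page fragment: this page uses <ul> rather than <ol>.
html = """
<ul>
  <li><em>A first paper title</em></li>
  <li><em>A second paper title</em></li>
</ul>
"""

soup = BeautifulSoup(html, "html.parser")
li_elements = soup.find_all('ol')
if not li_elements:                        # no ordered list on this page
    li_elements = soup.find_all('ul')      # fall back to the unordered list

articles = []
for li in li_elements:
    em_elements = li.find_all('em')
    if em_elements:
        articles.extend(em_elements)
    else:
        articles.extend(li.find_all('i'))  # the other HTML style uses <i>

print([a.get_text() for a in articles])    # -> ['A first paper title', 'A second paper title']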
@@ -157,21 +159,26 @@ def process_article(title, article_url):
     title_match = re.search(r"<h3>(.*?)<p>", article_text)
     title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else None
 
+    # Extract issue
+    issue_match = re.search(r'No\. (\d+)', article_text)
+    issue = issue_match.group(1) if issue_match else None
+
     # Extract volume
-    volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
+    volume_match = re.search(r'Vol\. (\d+)', article_text)
     volume = str(volume_match.group(1)) if volume_match else None
     if not volume:
-        volume_match = re.search(r'Vol\. (\d+)', article_text)
-        volume = str(volume_match.group(1)) if volume_match else None
+        volume_match = re.search(r'Special Issue (\d+) \((\d+)\)', article_text)
+        if volume_match:
+            issue_number, volume = volume_match.groups()
+            volume = str(volume)
+            issue = "Special Issue " + str(issue_number)
+        else:
+            volume = None
 
     # Extract pp
     pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
     pp = pp_match.group(1) if pp_match else None
 
-    # Extract issue
-    issue_match = re.search(r'No\. (\d+)', article_text)
-    issue = issue_match.group(1) if issue_match else None
-
     # Extract submission date
     match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
     submitted_date = match.group(1) if match else None
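A quick check of the two volume patterns used above. The header strings are invented examples in the style of EJDE citation lines, not scraped data.

import re

# Invented examples: a regular paper header and a special-issue header.
regular = "Electron. J. Differential Equations, Vol. 2023 (2023), No. 45, pp. 1-18."
special = "Electron. J. Differential Equations, Special Issue 02 (2023), pp. 1-30."

volume_match = re.search(r'Vol\. (\d+)', regular)
print(volume_match.group(1))                       # -> 2023

# When "Vol." is absent, the special-issue pattern supplies both issue and volume.
si_match = re.search(r'Special Issue (\d+) \((\d+)\)', special)
issue_number, volume = si_match.groups()
print("Special Issue " + issue_number, volume)     # -> Special Issue 02 2023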
@@ -207,7 +214,7 @@ def process_article(title, article_url):
         keywords = []
 
     # Extract DOI
-    doi_match = re.search(r'DOI: (.+)(?=<)', html)
+    doi_match = re.search(r'DOI: ([^\t\n<]+)', html)
     if not doi_match:
         doi_match = re.search(r'DOI: (.+)', html)
     doi = doi_match.group(1) if doi_match else None
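A small comparison of the old and new DOI patterns. The HTML line and the DOI value are invented for illustration.

import re

# Invented line: the DOI is followed by a tab on the same line and there is no '<' at all.
html = "DOI: 10.58997/ejde.2023.45\tMath Subject Classification: 35J60"

old = re.search(r'DOI: (.+)(?=<)', html)      # lookahead needs a later '<', so no match here
new = re.search(r'DOI: ([^\t\n<]+)', html)    # stops at a tab, newline, or '<'

print(old)                                    # -> None
print(new.group(1))                           # -> 10.58997/ejde.2023.45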
@@ -222,18 +229,33 @@ def process_article(title, article_url):
         for row in table.find_all('tr'):
             cells = [cell.text.strip() for cell in row.find_all('td')]
             for cell in cells:
+                if "email:" in cell:
+                    cell = cell.split("email:")
+                    email_list = str(cell[1]).split(',')
+                    cell = cell[0]
+                elif "e-mail:" in cell:
+                    cell = cell.split("e-mail:")
+                    email_list = str(cell[1]).split(',')
+                    cell = cell[0]
+                else:
+                    email_list = None
+
                 cell = re.split(r'[\r\n]+', cell)
-                cell = [c.replace('email: ', '').replace('\\newline', '') for c in cell]
+                cell = [c.replace('\\newline', '') for c in cell]
                 cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]
 
                 # Data processing
                 if cell[0]:
                     authors.append(cell[0])
                     name = re.split(r'[ .]', cell[0])
-                    affiliation = ', '.join(cell[1:-1])
-                    affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
-                    email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1])
-                    email = email_match.group() if email_match else None
+                    affiliation = ', '.join(cell[1:]).lstrip(",").rstrip(",").strip()
+                    affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
+                    affiliation = affiliation.lstrip(",").rstrip(",").strip()
+                    emails = []
+                    if email_list:
+                        for email in email_list:
+                            email_match = re.search(r'[\w.-]+@[\w.-]+', email)
+                            emails.append(email_match.group()) if email_match else None
 
                     author_data = {
                         "author_id": str(uuid.uuid4()),
@@ -244,7 +266,7 @@ def process_article(title, article_url):
                         "affiliation": [{
                             "year": volume,
                             "affiliation": affiliation,
-                            "email": email
+                            "email": emails
                         }]
                     }
                     authorData.append(author_data)
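A standalone sketch of the multiple-email fix shown in the two hunks above: the text after "email:" is split on commas and each piece is reduced to a bare address, so the author record now carries a list. The cell text is an invented example, and the sketch is simplified (it uses a plain if instead of the conditional-expression append).

import re

# Invented table-cell text listing two addresses for a single author.
cell = ("Jane Q. Author\n"
        "Example University, Somewhere\n"
        "email: jane@example.edu, j.author@example.edu")

email_list = None
if "email:" in cell:
    cell, tail = cell.split("email:", 1)
    email_list = tail.split(',')

emails = []
if email_list:
    for email in email_list:
        email_match = re.search(r'[\w.-]+@[\w.-]+', email)
        if email_match:
            emails.append(email_match.group())

print(emails)   # -> ['jane@example.edu', 'j.author@example.edu']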
@@ -267,23 +289,38 @@ def process_article(title, article_url):
         matches = matches.replace('["', '').replace('"]', '').replace('[\'', '').replace('\']', '')
         matches = matches.split("<p>")
         for match in matches:
+            if "email:" in match:
+                match = match.split("email:")
+                email_list = str(match[1]).split(',')
+                match = match[0]
+            elif "e-mail:" in match:
+                match = match.split("e-mail:")
+                email_list = str(match[1]).split(',')
+                match = match[0]
+            else:
+                email_list = None
+
             match = re.sub(r'<[^>]+>', '', match)
             match = match.lstrip("\\n ").lstrip("\n ").rstrip("\\n ").rstrip("\n ").strip()
             if match_type == 0:
                 match = match.split("\\n")
             else:
                 match = match.split("\n")
-            match = [m.replace('email: ', '').replace('\\newline', '') for m in match]
+            match = [m.replace('\\newline', '') for m in match]
             match = [re.sub(r'\s+', ' ', m).strip() for m in match]
 
             # Data processing
             if match[0]:
                 authors.append(match[0])
                 name = re.split(r'[ .]', match[0])
-                affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip()
-                affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
-                email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1])
-                email = email_match.group() if email_match else None
+                affiliation = ''.join(match[1:]).lstrip(",").rstrip(",").strip()
+                affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
+                affiliation = affiliation.lstrip(",").rstrip(",").strip()
+                emails = []
+                if email_list:
+                    for email in email_list:
+                        email_match = re.search(r'[\w.-]+@[\w.-]+', email)
+                        emails.append(email_match.group()) if email_match else None
 
                 author_data = {
                     "author_id": str(uuid.uuid4()),
@@ -294,7 +331,7 @@ def process_article(title, article_url):
                     "affiliation": [{
                         "year": volume,
                         "affiliation": affiliation,
-                        "email": email
+                        "email": emails
                     }]
                 }
                 authorData.append(author_data)
@@ -339,14 +376,21 @@ def process_article(title, article_url):
 start_time = time.time()
+url_list = []
 
 # Get all general volumes url
 index = "https://ejde.math.txstate.edu/indexleft.html"
 response = requests.get(index)
 soup = BeautifulSoup(response.content, 'html.parser')
 
 # Find all the URL links under the first Volume section
 volume_links = soup.select('font > a[href]')
 # Extract and store the URLs in a list using list comprehension
-url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1]
+url_list.extend(["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1])
 
+# Get all special issues url
+index = "https://ejde.math.txstate.edu/special-toc.html"
+response = requests.get(index)
+soup = BeautifulSoup(response.content, 'html.parser')
+special_links = soup.find_all("a", href=True)
+url_list.extend(["https://ejde.math.txstate.edu/" + tag["href"] for tag in special_links[:-1]])
 
 # Initialize lists
 authorData = []
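The special-issue index is handled the same way as the volume index: fetch the table-of-contents page, take every anchor that has an href, and drop the last one (assumed here to be a navigation link rather than an issue). A minimal sketch with an invented page; the hrefs are placeholders.

from bs4 import BeautifulSoup

# Invented table-of-contents fragment; the trailing anchor is a "back" link.
html = """
<a href="special-issue-01.html">Special Issue 01</a>
<a href="special-issue-02.html">Special Issue 02</a>
<a href="index.html">Back to main index</a>
"""

soup = BeautifulSoup(html, 'html.parser')
special_links = soup.find_all("a", href=True)

url_list = []
url_list.extend(["https://ejde.math.txstate.edu/" + tag["href"] for tag in special_links[:-1]])
print(url_list)
# -> ['https://ejde.math.txstate.edu/special-issue-01.html',
#     'https://ejde.math.txstate.edu/special-issue-02.html']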
@@ -361,7 +405,7 @@ authorNum = 0
 articleNum = 0
 
 batch_size = 100  # Number of articles to process before saving
-executor = ThreadPoolExecutor(max_workers=25)  # Set the number of worker threads
+executor = ThreadPoolExecutor(max_workers=len(url_list))  # Set the number of worker threads
 
 # Process each URL using multithreading
 futures = [executor.submit(process_volume, url) for url in url_list]
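With one worker per volume URL, every volume page is fetched concurrently. A minimal sketch of the submit-and-collect pattern used here, assuming the as_completed import already present at the top of the file; the worker body and URLs are stand-ins.

from concurrent.futures import ThreadPoolExecutor, as_completed

def process_volume(url):
    # Stand-in for the real scraper: it only echoes the URL back.
    return url

url_list = ["https://example.org/volume-1", "https://example.org/volume-2"]  # placeholders

executor = ThreadPoolExecutor(max_workers=len(url_list))
futures = [executor.submit(process_volume, url) for url in url_list]

for future in as_completed(futures):
    print("finished:", future.result())   # .result() re-raises any worker exception

executor.shutdown()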
@@ -393,12 +437,12 @@ while failedData:
 if len(articleData) > 0:
     ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
     print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
-    print("Total fetched paper:", len(articleData) + articleNum)
+    print("Total fetched paper:", len(articleData) + articleNum)
 
 if len(authorData) > 0:
     ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
     print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
-    print("Total fetched author:", len(authorData) + authorNum)
+    print("Total fetched author:", len(authorData) + authorNum)
 
 # Save error record
 if len(totallyFailedData) > 0:
@@ -418,4 +462,4 @@ print("time elapsed: {:.2f}s".format(time.time() - start_time))
 
 # Transfer to large file and delete the temporary storage files
 ejde_save.Transf()
-ejde_save.delete()
+ejde_save.delete()