diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py
index 4ae686f..f36a618 100644
--- a/01_EJDE_spider/ejde_main.py
+++ b/01_EJDE_spider/ejde_main.py
@@ -7,13 +7,13 @@ import ejde_save
 from retrying import retry
 from datetime import datetime
 from bs4 import BeautifulSoup
-from concurrent.futures import ThreadPoolExecutor, as_completed, wait
+from concurrent.futures import ThreadPoolExecutor, as_completed
 
 '''
     Target site: 'ejde.math.txstate.edu'
     Total number of papers: 2023/08/08 - 4300
-    Total Time via VPN w/119ms-delay: 441.80s
+    Total Time via VPN w/100ms-delay: 254.04s
 
     ========== Execution order ==========
     1. ejde_main fetches the volume links for each year -> scrapes each paper's metadata and author info -> calls ejde_save -> stores the results temporarily in small json files
@@ -152,9 +152,17 @@ def process_article(title, article_url):
     article_soup = BeautifulSoup(html, 'html.parser')
     article_text = article_soup.get_text()
 
+    # Extract title if title == None
+    if not title:
+        title_match = re.search(r"<title>(.*?)</title>", article_text)
+        title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else None
+
     # Extract volume
     volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
     volume = str(volume_match.group(1)) if volume_match else None
+    if not volume:
+        volume_match = re.search(r'Vol\. (\d+)', article_text)
+        volume = str(volume_match.group(1)) if volume_match else None
 
     # Extract pp
     pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
@@ -181,17 +189,18 @@ def process_article(title, article_url):
     if not msc_match:
         msc_match = re.search(r'Math Subject Classifications:(.*?)\.', html)
     if msc_match:
-        msc = msc_match.group(1).strip().strip('.').strip()
+        msc = re.sub(r'<[^>]+>', '', msc_match.group(1).strip())
+        msc = msc.strip('.').strip()
         msc = re.split(r', |;', msc)
     else:
         msc = []
 
     # Extract KeyWords
-    keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
+    keywords_match = re.search(r'Key Words: (.*?)(?:<br>|<p>|$)', html, re.DOTALL)
     if not keywords_match:
-        keywords_match = re.search(r'Key Words: (.*?)<p>', html, re.DOTALL)
+        keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
     if keywords_match:
-        keywords = keywords_match.group(1).strip().replace('\n', '')
+        keywords = re.sub(r'<[^>]+>', '', keywords_match.group(1).strip()).replace('\n', '')
         keywords = re.split(r', |;', keywords)
         keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords]
     else:
@@ -213,72 +222,89 @@ def process_article(title, article_url):
         for row in table.find_all('tr'):
             cells = [cell.text.strip() for cell in row.find_all('td')]
             for cell in cells:
-                cell = cell.split("\n")
-                cell = [element.replace('email: ', '') for element in cell]
-                cell = [c.strip() for c in cell]
+                cell = re.split(r'[\r\n]+', cell)
+                cell = [c.replace('email: ', '').replace('\\newline', '') for c in cell]
+                cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]
 
                 # Data processing
-                authors.append(cell[0])
-                name = cell[0].split(" ")
-                middle_name = ''.join(name[1:-1]) if name[1:-1] else None
-                affiliation = ', '.join(cell[1:-1])
-                affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
-                email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1])
-                email = email_match.group() if email_match else None
+                if cell[0]:
+                    authors.append(cell[0])
+                    name = re.split(r'[ .]', cell[0])
+                    affiliation = ', '.join(cell[1:-1])
+                    affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
+                    email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1])
+                    email = email_match.group() if email_match else None
 
-                author_data = {
-                    "author_id": str(uuid.uuid4()),
-                    "from_article": [article_id],
-                    "first_name": name[0],
-                    "last_name": name[-1],
-                    "middle_name": middle_name,
-                    "affiliation": [{
-                        "year": volume,
-                        "affiliation": affiliation,
-                        "email": email
-                    }]
-                }
-                authorData.append(author_data)
+                    author_data = {
+                        "author_id": str(uuid.uuid4()),
+                        "from_article": article_id,
+                        "first_name": name[0],
+                        "last_name": name[-1],
+                        "middle_name": ''.join(name[1:-1]) if name[1:-1] else None,
+                        "affiliation": [{
+                            "year": volume,
+                            "affiliation": affiliation,
+                            "email": email
+                        }]
+                    }
+                    authorData.append(author_data)
 
     # If no author table
     else:
-        pattern = r'<li>(.*?)</li>'
-        matches = re.findall(pattern, html, re.DOTALL)
+        match_type = 0
+        matches = re.findall(r"<li>(.*?)</li>", html, re.DOTALL)
+        if not matches:
+            match_type = 1
+            matches = re.findall(r"<p>(.*?)</p>", html, re.DOTALL)
         if matches:
") - for match in matches: match = re.sub(r'<[^>]+>', '', match) - match = match.lstrip("\\n ").rstrip("\\n ").strip() - match = match.split("\\n") - match = [element.replace('email: ', '') for element in match] - match = [m.strip() for m in match] + match = match.lstrip("\\n ").lstrip("\n ").rstrip("\\n ").rstrip("\n ").strip() + if match_type == 0: + match = match.split("\\n") + else: + match = match.split("\n") + match = [m.replace('email: ', '').replace('\\newline', '') for m in match] + match = [re.sub(r'\s+', ' ', m).strip() for m in match] # Data processing - authors.append(match[0]) - name = match[0].split(" ") - middle_name = ''.join(name[1:-1]) if name[1:-1] else None - affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip() - affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip() - email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1]) - email = email_match.group() if email_match else None + if match[0]: + authors.append(match[0]) + name = re.split(r'[ .]', match[0]) + affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip() + affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip() + email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1]) + email = email_match.group() if email_match else None - author_data = { - "author_id": str(uuid.uuid4()), - "from_article": [article_id], - "first_name": name[0], - "last_name": name[-1], - "middle_name": middle_name, - "affiliation": [{ - "year": volume, - "affiliation": affiliation, - "email": email - }] - } - authorData.append(author_data) + author_data = { + "author_id": str(uuid.uuid4()), + "from_article": article_id, + "first_name": name[0], + "last_name": name[-1], + "middle_name": ''.join(name[1:-1]) if name[1:-1] else None, + "affiliation": [{ + "year": volume, + "affiliation": affiliation, + "email": email + }] + } + authorData.append(author_data) else: print("AUTHOR SEARCHING ERROR:", article_url) - return + fail = { + "title": title, + "URL": article_url + } + failedFormatData.append(fail) # Article info article_data = { @@ -347,8 +373,6 @@ for future in as_completed(futures): except Exception as vol_err: print("VOLUME PROCESSING ERROR:", str(vol_err)) -wait(futures) - # Retry failed processing paper print("START RETRYING:", len(failedData)) while failedData: @@ -378,15 +402,15 @@ if len(authorData) > 0: # Save error record if len(totallyFailedData) > 0: - ejde_save.save_data(failedData, "", "Failed_article_record.json") + ejde_save.save_data(totallyFailedData, "Error", "Failed_article_record.json") print("Total failed processing paper:", len(totallyFailedData)) if len(failedVolData) > 0: - ejde_save.save_data(failedVolData, "", "Failed_volume_record.json") + ejde_save.save_data(failedVolData, "Error", "Failed_volume_record.json") print("Total failed fetching volume:", len(failedVolData)) if len(failedFormatData) > 0: - ejde_save.save_data(failedFormatData, "", "Failed_format_record.json") + ejde_save.save_data(failedFormatData, "Error", "Failed_format_record.json") print("Total failed searching article:", len(failedFormatData)) # Total running time @@ -394,4 +418,4 @@ print("time elapsed: {:.2f}s".format(time.time() - start_time)) # Transfer to large file and delete the temporary storage files ejde_save.Transf() -ejde_save.delete() +ejde_save.delete() \ No newline at end of file