From 35f5f2ac5e39a53061369a86993cf62682da1356 Mon Sep 17 00:00:00 2001 From: ldy <1913292237@qq.com> Date: Fri, 11 Aug 2023 11:42:02 +0800 Subject: [PATCH 1/7] Optimization: clustered error files into a folder --- 01_EJDE_spider/ejde_main.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py index 1876b22..86b8b68 100644 --- a/01_EJDE_spider/ejde_main.py +++ b/01_EJDE_spider/ejde_main.py @@ -155,6 +155,8 @@ def process_article(title, article_url): # Extract volume volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text) volume = str(volume_match.group(1)) if volume_match else None + if not volume: + volume = str(volume_match.group(2)) if volume_match else None # Extract pp pp_match = re.search(r'pp\. (\d+-\d+)', article_text) @@ -374,15 +376,15 @@ if len(authorData) > 0: # Save error record if len(totallyFailedData) > 0: - ejde_save.save_data(failedData, "", "Failed_article_record.json") + ejde_save.save_data(failedData, "Error", "Failed_article_record.json") print("Total failed processing paper:", len(totallyFailedData)) if len(failedVolData) > 0: - ejde_save.save_data(failedVolData, "", "Failed_volume_record.json") + ejde_save.save_data(failedVolData, "Error", "Failed_volume_record.json") print("Total failed fetching volume:", len(failedVolData)) if len(failedFormatData) > 0: - ejde_save.save_data(failedFormatData, "", "Failed_format_record.json") + ejde_save.save_data(failedFormatData, "Error", "Failed_format_record.json") print("Total failed searching article:", len(failedFormatData)) # Total running time From 3e78e9f48e6c1be92bb861504bfc4484cec3012e Mon Sep 17 00:00:00 2001 From: ldy <1913292237@qq.com> Date: Fri, 11 Aug 2023 14:26:59 +0800 Subject: [PATCH 2/7] Optimization: 1. added new regular expression format for volume 2. added new strip method for msc 3. deleted blank-space author 4. optimized middle name strip method 5. added new matching pattern for no table author list 6. added exception storing for AUTHOR SEARCHING ERROR Bug fix: 1. error record saving --- 01_EJDE_spider/ejde_main.py | 113 +++++++++++++++++++++--------------- 1 file changed, 66 insertions(+), 47 deletions(-) diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py index 86b8b68..3c9c71a 100644 --- a/01_EJDE_spider/ejde_main.py +++ b/01_EJDE_spider/ejde_main.py @@ -13,7 +13,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed 爬取网站:'ejde.math.txstate.edu' Total number of papers: 2023/08/08 - 4300 - Total Time via VPN w/119ms-delay: 441.80s + Total Time via VPN w/100ms-delay: 254.04s ==========运行顺序========== 1、ejde_main 获取各年份的期刊链接 -> 抓取各篇论文的信息和作者信息 -> 调用ejde_save -> 存入小文件(json)暂存 @@ -156,7 +156,8 @@ def process_article(title, article_url): volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text) volume = str(volume_match.group(1)) if volume_match else None if not volume: - volume = str(volume_match.group(2)) if volume_match else None + volume_match = re.search(r'Vol\. (\d+)', article_text) + volume = str(volume_match.group(1)) if volume_match else None # Extract pp pp_match = re.search(r'pp\. (\d+-\d+)', article_text) @@ -183,7 +184,8 @@ def process_article(title, article_url): if not msc_match: msc_match = re.search(r'Math Subject Classifications:(.*?)\.', html) if msc_match: - msc = msc_match.group(1).strip().strip('.').strip() + msc = re.sub(r'<[^>]+>', '', msc_match.group(1).strip()) + msc = msc.strip('.').strip() msc = re.split(r', |;', msc) else: msc = None @@ -217,68 +219,85 @@ def process_article(title, article_url): for cell in cells: cell = cell.split("\n") cell = [element.replace('email: ', '') for element in cell] - cell = [c.strip() for c in cell] + cell = [re.sub(r'\s+', ' ', c).strip() for c in cell] # Data processing - authors.append(cell[0]) - name = cell[0].split(" ") - affiliation = ', '.join(cell[1:-1]) - affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip() - email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1]) - email = email_match.group() if email_match else None + if cell[0]: + authors.append(cell[0]) + name = cell[0].split(" ") + affiliation = ', '.join(cell[1:-1]) + affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip() + email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1]) + email = email_match.group() if email_match else None - author_data = { - "author_id": str(uuid.uuid4()), - "from_article": article_id, - "firstname": name[0], - "lastname": name[-1], - "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(name) > 2 else None, - "affiliation": [{ - "year": volume, - "affiliation": affiliation, - "email": email - }] - } - authorData.append(author_data) + author_data = { + "author_id": str(uuid.uuid4()), + "from_article": article_id, + "firstname": name[0], + "lastname": name[-1], + "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len( + name) > 2 else None, + "affiliation": [{ + "year": volume, + "affiliation": affiliation, + "email": email + }] + } + authorData.append(author_data) # If no author table else: + match_type = 0 pattern = r'
(.*?)
' matches = str(re.findall(pattern, html, re.DOTALL)) + if len(matches) < 5: + match_type = 1 + last_p_tag = str(article_soup.find_all('p')[-1]) + pattern = r'

(.*?)


' + matches = re.search(pattern, str(last_p_tag), re.DOTALL).group(1).strip() + if matches: matches = matches.replace('["', '').replace('"]', '').replace('[\'', '').replace('\']', '') matches = matches.split("

") - for match in matches: match = re.sub(r'<[^>]+>', '', match) match = match.lstrip("\\n ").rstrip("\\n ").strip() - match = match.split("\\n") + if match_type == 0: + match = match.split("\\n") + else: + match = match.split("\n") match = [element.replace('email: ', '') for element in match] - match = [m.strip() for m in match] + match = [re.sub(r'\s+', ' ', m).strip() for m in match] # Data processing - authors.append(match[0]) - name = match[0].split(" ") - affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip() - affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip() - email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1]) - email = email_match.group() if email_match else None + if match[0]: + authors.append(match[0]) + name = match[0].split(" ") + affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip() + affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip() + email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1]) + email = email_match.group() if email_match else None - author_data = { - "author_id": str(uuid.uuid4()), - "from_article": article_id, - "firstname": name[0], - "lastname": name[-1], - "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(name) > 2 else None, - "affiliation": [{ - "year": volume, - "affiliation": affiliation, - "email": email - }] - } - authorData.append(author_data) + author_data = { + "author_id": str(uuid.uuid4()), + "from_article": article_id, + "firstname": name[0], + "lastname": name[-1], + "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len( + name) > 2 else None, + "affiliation": [{ + "year": volume, + "affiliation": affiliation, + "email": email + }] + } + authorData.append(author_data) else: print("AUTHOR SEARCHING ERROR:", article_url) - return + fail = { + "title": title, + "URL": article_url + } + failedFormatData.append(fail) # Article info article_data = { @@ -376,7 +395,7 @@ if len(authorData) > 0: # Save error record if len(totallyFailedData) > 0: - ejde_save.save_data(failedData, "Error", "Failed_article_record.json") + ejde_save.save_data(totallyFailedData, "Error", "Failed_article_record.json") print("Total failed processing paper:", len(totallyFailedData)) if len(failedVolData) > 0: From f97195c94dab9ace48e72f7c7f2d30ef9bea6ecc Mon Sep 17 00:00:00 2001 From: ldy <1913292237@qq.com> Date: Fri, 11 Aug 2023 18:05:15 +0800 Subject: [PATCH 3/7] Bug Fix: handled exception when the volume website has no title --- 01_EJDE_spider/ejde_main.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py index 3c9c71a..53228d4 100644 --- a/01_EJDE_spider/ejde_main.py +++ b/01_EJDE_spider/ejde_main.py @@ -152,6 +152,11 @@ def process_article(title, article_url): article_soup = BeautifulSoup(html, 'html.parser') article_text = article_soup.get_text() + # Extract title if title == None + if not title: + title_match = re.search(r"

(.*?)

", article_text) + title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else None + # Extract volume volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text) volume = str(volume_match.group(1)) if volume_match else None From 68a755a633b155aa10cb8c048cc7358b12b24304 Mon Sep 17 00:00:00 2001 From: ldy <1913292237@qq.com> Date: Fri, 11 Aug 2023 19:13:33 +0800 Subject: [PATCH 4/7] Bug Fix: 1. added split author data when hits "\n" 2. added split name by "." 3. added method extracting author info when have 3 hr tag --- 01_EJDE_spider/ejde_main.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py index 53228d4..2ea14c9 100644 --- a/01_EJDE_spider/ejde_main.py +++ b/01_EJDE_spider/ejde_main.py @@ -222,14 +222,14 @@ def process_article(title, article_url): for row in table.find_all('tr'): cells = [cell.text.strip() for cell in row.find_all('td')] for cell in cells: - cell = cell.split("\n") + cell = re.split(r'[\r\n]+', cell) cell = [element.replace('email: ', '') for element in cell] cell = [re.sub(r'\s+', ' ', c).strip() for c in cell] # Data processing if cell[0]: authors.append(cell[0]) - name = cell[0].split(" ") + name = re.split(r'[ .]', cell[0]) affiliation = ', '.join(cell[1:-1]) affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip() email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1]) @@ -252,7 +252,11 @@ def process_article(title, article_url): # If no author table else: match_type = 0 - pattern = r'


(.*?)
' + hr_count = len(soup.find_all('hr')) + if hr_count < 3: + pattern = r'
(.*?)
' + else: + pattern = r'
(?:.*
)(.*)(?=
)' matches = str(re.findall(pattern, html, re.DOTALL)) if len(matches) < 5: match_type = 1 @@ -276,7 +280,7 @@ def process_article(title, article_url): # Data processing if match[0]: authors.append(match[0]) - name = match[0].split(" ") + name = re.split(r'[ .]', match[0]) affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip() affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip() email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1]) From b1eba69085d8b8c8ecdd31eaf1fb68bdf47b9322 Mon Sep 17 00:00:00 2001 From: ldy <1913292237@qq.com> Date: Fri, 11 Aug 2023 19:16:03 +0800 Subject: [PATCH 5/7] Bug Fix: 1. hr_count soup should be article_soup --- 01_EJDE_spider/ejde_main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py index 2ea14c9..d13393f 100644 --- a/01_EJDE_spider/ejde_main.py +++ b/01_EJDE_spider/ejde_main.py @@ -252,7 +252,7 @@ def process_article(title, article_url): # If no author table else: match_type = 0 - hr_count = len(soup.find_all('hr')) + hr_count = len(article_soup.find_all('hr')) if hr_count < 3: pattern = r'
(.*?)
' else: From ed469ee362cef9708f37a54526815a2075276875 Mon Sep 17 00:00:00 2001 From: ldy <1913292237@qq.com> Date: Fri, 11 Aug 2023 19:52:40 +0800 Subject: [PATCH 6/7] Bug Fix: 1. reformat regular expressions for keyword matching --- 01_EJDE_spider/ejde_main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py index d13393f..bd8429c 100644 --- a/01_EJDE_spider/ejde_main.py +++ b/01_EJDE_spider/ejde_main.py @@ -196,11 +196,11 @@ def process_article(title, article_url): msc = None # Extract KeyWords - keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL) + keywords_match = re.search(r'Key Words: (.*?)(?:
|

|$)', html, re.DOTALL) if not keywords_match: - keywords_match = re.search(r'Key Words: (.*?)
', html, re.DOTALL) + keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL) if keywords_match: - keywords = keywords_match.group(1).strip().replace('\n', '') + keywords = re.sub(r'<[^>]+>', '', keywords_match.group(1).strip()).replace('\n', '') keywords = re.split(r', |;', keywords) keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords] else: From 083e6c87eb4da86d996adc791f3b618ddebe076e Mon Sep 17 00:00:00 2001 From: ldy <1913292237@qq.com> Date: Fri, 11 Aug 2023 20:45:04 +0800 Subject: [PATCH 7/7] Optimization: strip "\newline" in author name --- 01_EJDE_spider/ejde_main.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py index bd8429c..17577cf 100644 --- a/01_EJDE_spider/ejde_main.py +++ b/01_EJDE_spider/ejde_main.py @@ -223,7 +223,7 @@ def process_article(title, article_url): cells = [cell.text.strip() for cell in row.find_all('td')] for cell in cells: cell = re.split(r'[\r\n]+', cell) - cell = [element.replace('email: ', '') for element in cell] + cell = [c.replace('email: ', '').replace('\\newline', '') for c in cell] cell = [re.sub(r'\s+', ' ', c).strip() for c in cell] # Data processing @@ -240,8 +240,8 @@ def process_article(title, article_url): "from_article": article_id, "firstname": name[0], "lastname": name[-1], - "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len( - name) > 2 else None, + "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0 + ] if len(name) > 2 else None, "affiliation": [{ "year": volume, "affiliation": affiliation, @@ -269,12 +269,12 @@ def process_article(title, article_url): matches = matches.split("

") for match in matches: match = re.sub(r'<[^>]+>', '', match) - match = match.lstrip("\\n ").rstrip("\\n ").strip() + match = match.lstrip("\\n ").lstrip("\n ").rstrip("\\n ").rstrip("\n ").strip() if match_type == 0: match = match.split("\\n") else: match = match.split("\n") - match = [element.replace('email: ', '') for element in match] + match = [m.replace('email: ', '').replace('\\newline', '') for m in match] match = [re.sub(r'\s+', ' ', m).strip() for m in match] # Data processing @@ -291,8 +291,8 @@ def process_article(title, article_url): "from_article": article_id, "firstname": name[0], "lastname": name[-1], - "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len( - name) > 2 else None, + "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0 + ] if len(name) > 2 else None, "affiliation": [{ "year": volume, "affiliation": affiliation, @@ -420,4 +420,4 @@ print("time elapsed: {:.2f}s".format(time.time() - start_time)) # Transfer to large file and delete the temporary storage files ejde_save.Transf() -# ejde_save.delete() +ejde_save.delete()