Optimization:
1. Add special issue papers. 2. Optimize the DOI regular expression.
Bug fix:
1. Multiple-email problem.
Reminder:
1. The author email data structure has changed because of the multiple-email fix.
parent 88bcbf5b8f
commit ba3671b5fd
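Note on the reminder above: after this commit the per-affiliation "email" field holds a list of addresses rather than a single string. A minimal sketch of the new record shape, with made-up values (only the keys come from the diff below):

# Illustrative author record after this commit; the values are invented.
# The per-affiliation "email" field is now a list (possibly empty), not one string.
author_data = {
    "author_id": "generated-uuid",
    "affiliation": [{
        "year": "2023",
        "affiliation": "Example University",
        "email": ["a.author@example.edu", "b.author@example.edu"],
    }],
}

# Consumers that previously expected one string should iterate over the list.
for entry in author_data["affiliation"]:
    for address in entry["email"]:
        print(address)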
@@ -12,8 +12,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 '''
 Site crawled: 'ejde.math.txstate.edu'
 
-Total number of papers: 2023/08/08 - 4300
-Total Time via VPN w/100ms-delay: 254.04s
+Total number of papers: 2023/08/08 - 4339
+Total Time via VPN w/52ms-delay: 430.38s
 
 ========== Run order ==========
 1. ejde_main fetches the volume links for each year -> scrapes each paper's information and author information -> calls ejde_save -> stores the data temporarily in small JSON files
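For orientation, a minimal sketch of the buffering flow named in the run order, assuming the ejde_save helpers behave as their call sites later in this diff suggest; the record contents are placeholders.

import uuid

import ejde_save

articleData = [{"title": "placeholder article"}]
authorData = [{"author_id": str(uuid.uuid4())}]

# Each batch is written to a small temporary JSON file under ./ejde_buffer/ ...
ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")

# ... then merged into one large file, after which the temporary files are removed.
ejde_save.Transf()
ejde_save.delete()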
@@ -68,15 +68,17 @@ def process_volume(url):
     baseWeb = url[:url.rfind('/')] + "/"
     html = volume_response.text
     volume_soup = BeautifulSoup(html, "html.parser")
-    ol_elements = volume_soup.find_all('ol')
+    li_elements = volume_soup.find_all('ol')
+    if not li_elements:
+        li_elements = volume_soup.find_all('ul')
 
-    for ol in ol_elements:
-        em_elements = ol.find_all('em')
+    for li in li_elements:
+        em_elements = li.find_all('em')
         if em_elements:
             articles.extend(em for em in em_elements)
         # Another html style
         else:
-            i_elements = ol.find_all('i')
+            i_elements = li.find_all('i')
             if i_elements:
                 articles.extend(i for i in i_elements)
             else:
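A standalone sketch of the fallback added here: prefer ordered lists, and fall back to unordered lists when a volume page uses the other markup. The HTML fragment is invented for illustration.

from bs4 import BeautifulSoup

# Invented volume-page fragment: this page uses <ul> rather than <ol>.
html = """
<ul>
  <li><em>A first paper title</em></li>
  <li><em>A second paper title</em></li>
</ul>
"""

soup = BeautifulSoup(html, "html.parser")
li_elements = soup.find_all('ol')
if not li_elements:                        # no ordered list on this page
    li_elements = soup.find_all('ul')      # fall back to the unordered list

articles = []
for li in li_elements:
    em_elements = li.find_all('em')
    if em_elements:
        articles.extend(em_elements)
    else:
        articles.extend(li.find_all('i'))  # the other HTML style uses <i>

print([a.get_text() for a in articles])    # -> ['A first paper title', 'A second paper title']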
@@ -157,21 +159,26 @@ def process_article(title, article_url):
     title_match = re.search(r"<h3>(.*?)<p>", article_text)
     title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else None
 
+    # Extract issue
+    issue_match = re.search(r'No\. (\d+)', article_text)
+    issue = issue_match.group(1) if issue_match else None
+
     # Extract volume
-    volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
+    volume_match = re.search(r'Vol\. (\d+)', article_text)
     volume = str(volume_match.group(1)) if volume_match else None
     if not volume:
-        volume_match = re.search(r'Vol\. (\d+)', article_text)
-        volume = str(volume_match.group(1)) if volume_match else None
+        volume_match = re.search(r'Special Issue (\d+) \((\d+)\)', article_text)
+        if volume_match:
+            issue_number, volume = volume_match.groups()
+            volume = str(volume)
+            issue = "Special Issue " + str(issue_number)
+        else:
+            volume = None
 
     # Extract pp
     pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
     pp = pp_match.group(1) if pp_match else None
 
-    # Extract issue
-    issue_match = re.search(r'No\. (\d+)', article_text)
-    issue = issue_match.group(1) if issue_match else None
-
     # Extract submission date
     match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
     submitted_date = match.group(1) if match else None
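A quick check of the two volume patterns used above. The header strings are invented examples in the style of EJDE citation lines, not scraped data.

import re

# Invented examples: a regular paper header and a special-issue header.
regular = "Electron. J. Differential Equations, Vol. 2023 (2023), No. 45, pp. 1-18."
special = "Electron. J. Differential Equations, Special Issue 02 (2023), pp. 1-30."

volume_match = re.search(r'Vol\. (\d+)', regular)
print(volume_match.group(1))                       # -> 2023

# When "Vol." is absent, the special-issue pattern supplies both issue and volume.
si_match = re.search(r'Special Issue (\d+) \((\d+)\)', special)
issue_number, volume = si_match.groups()
print("Special Issue " + issue_number, volume)     # -> Special Issue 02 2023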
@@ -207,7 +214,7 @@ def process_article(title, article_url):
         keywords = []
 
     # Extract DOI
-    doi_match = re.search(r'DOI: (.+)(?=<)', html)
+    doi_match = re.search(r'DOI: ([^\t\n<]+)', html)
     if not doi_match:
         doi_match = re.search(r'DOI: (.+)', html)
     doi = doi_match.group(1) if doi_match else None
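A small comparison of the old and new DOI patterns. The HTML line and the DOI value are invented for illustration.

import re

# Invented line: the DOI is followed by a tab on the same line and there is no '<' at all.
html = "DOI: 10.58997/ejde.2023.45\tMath Subject Classification: 35J60"

old = re.search(r'DOI: (.+)(?=<)', html)      # lookahead needs a later '<', so no match here
new = re.search(r'DOI: ([^\t\n<]+)', html)    # stops at a tab, newline, or '<'

print(old)                                    # -> None
print(new.group(1))                           # -> 10.58997/ejde.2023.45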
@@ -222,18 +229,33 @@ def process_article(title, article_url):
         for row in table.find_all('tr'):
             cells = [cell.text.strip() for cell in row.find_all('td')]
             for cell in cells:
+                if "email:" in cell:
+                    cell = cell.split("email:")
+                    email_list = str(cell[1]).split(',')
+                    cell = cell[0]
+                elif "e-mail:" in cell:
+                    cell = cell.split("e-mail:")
+                    email_list = str(cell[1]).split(',')
+                    cell = cell[0]
+                else:
+                    email_list = None
+
                 cell = re.split(r'[\r\n]+', cell)
-                cell = [c.replace('email: ', '').replace('\\newline', '') for c in cell]
+                cell = [c.replace('\\newline', '') for c in cell]
                 cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]
 
                 # Data processing
                 if cell[0]:
                     authors.append(cell[0])
                     name = re.split(r'[ .]', cell[0])
-                    affiliation = ', '.join(cell[1:-1])
-                    affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
-                    email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1])
-                    email = email_match.group() if email_match else None
+                    affiliation = ', '.join(cell[1:]).lstrip(",").rstrip(",").strip()
+                    affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
+                    affiliation = affiliation.lstrip(",").rstrip(",").strip()
+                    emails = []
+                    if email_list:
+                        for email in email_list:
+                            email_match = re.search(r'[\w.-]+@[\w.-]+', email)
+                            emails.append(email_match.group()) if email_match else None
 
                     author_data = {
                         "author_id": str(uuid.uuid4()),
@@ -244,7 +266,7 @@ def process_article(title, article_url):
                         "affiliation": [{
                             "year": volume,
                             "affiliation": affiliation,
-                            "email": email
+                            "email": emails
                         }]
                     }
                     authorData.append(author_data)
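A standalone sketch of the multiple-email fix shown in the two hunks above: the text after "email:" is split on commas and each piece is reduced to a bare address, so the author record now carries a list. The cell text is an invented example, and the sketch is simplified (it uses a plain if instead of the conditional-expression append).

import re

# Invented table-cell text listing two addresses for a single author.
cell = ("Jane Q. Author\n"
        "Example University, Somewhere\n"
        "email: jane@example.edu, j.author@example.edu")

email_list = None
if "email:" in cell:
    cell, tail = cell.split("email:", 1)
    email_list = tail.split(',')

emails = []
if email_list:
    for email in email_list:
        email_match = re.search(r'[\w.-]+@[\w.-]+', email)
        if email_match:
            emails.append(email_match.group())

print(emails)   # -> ['jane@example.edu', 'j.author@example.edu']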
@@ -267,23 +289,38 @@ def process_article(title, article_url):
         matches = matches.replace('["', '').replace('"]', '').replace('[\'', '').replace('\']', '')
         matches = matches.split("<p>")
         for match in matches:
+            if "email:" in match:
+                match = match.split("email:")
+                email_list = str(match[1]).split(',')
+                match = match[0]
+            elif "e-mail:" in match:
+                match = match.split("e-mail:")
+                email_list = str(match[1]).split(',')
+                match = match[0]
+            else:
+                email_list = None
+
             match = re.sub(r'<[^>]+>', '', match)
             match = match.lstrip("\\n ").lstrip("\n ").rstrip("\\n ").rstrip("\n ").strip()
             if match_type == 0:
                 match = match.split("\\n")
             else:
                 match = match.split("\n")
-            match = [m.replace('email: ', '').replace('\\newline', '') for m in match]
+            match = [m.replace('\\newline', '') for m in match]
             match = [re.sub(r'\s+', ' ', m).strip() for m in match]
 
             # Data processing
             if match[0]:
                 authors.append(match[0])
                 name = re.split(r'[ .]', match[0])
-                affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip()
-                affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
-                email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1])
-                email = email_match.group() if email_match else None
+                affiliation = ''.join(match[1:]).lstrip(",").rstrip(",").strip()
+                affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
+                affiliation = affiliation.lstrip(",").rstrip(",").strip()
+                emails = []
+                if email_list:
+                    for email in email_list:
+                        email_match = re.search(r'[\w.-]+@[\w.-]+', email)
+                        emails.append(email_match.group()) if email_match else None
 
                 author_data = {
                     "author_id": str(uuid.uuid4()),
@@ -294,7 +331,7 @@ def process_article(title, article_url):
                     "affiliation": [{
                         "year": volume,
                         "affiliation": affiliation,
-                        "email": email
+                        "email": emails
                     }]
                 }
                 authorData.append(author_data)
@@ -339,14 +376,21 @@ def process_article(title, article_url):
 start_time = time.time()
+url_list = []
 
 # Get all general volumes url
 index = "https://ejde.math.txstate.edu/indexleft.html"
 response = requests.get(index)
 soup = BeautifulSoup(response.content, 'html.parser')
 
 # Find all the URL links under the first Volume section
 volume_links = soup.select('font > a[href]')
 # Extract and store the URLs in a list using list comprehension
-url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1]
+url_list.extend(["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1])
 
+# Get all special issues url
+index = "https://ejde.math.txstate.edu/special-toc.html"
+response = requests.get(index)
+soup = BeautifulSoup(response.content, 'html.parser')
+special_links = soup.find_all("a", href=True)
+url_list.extend(["https://ejde.math.txstate.edu/" + tag["href"] for tag in special_links[:-1]])
 
 # Initialize lists
 authorData = []
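The special-issue index is handled the same way as the volume index: fetch the table-of-contents page, take every anchor that has an href, and drop the last one (assumed here to be a navigation link rather than an issue). A minimal sketch with an invented page; the hrefs are placeholders.

from bs4 import BeautifulSoup

# Invented table-of-contents fragment; the trailing anchor is a "back" link.
html = """
<a href="special-issue-01.html">Special Issue 01</a>
<a href="special-issue-02.html">Special Issue 02</a>
<a href="index.html">Back to main index</a>
"""

soup = BeautifulSoup(html, 'html.parser')
special_links = soup.find_all("a", href=True)

url_list = []
url_list.extend(["https://ejde.math.txstate.edu/" + tag["href"] for tag in special_links[:-1]])
print(url_list)
# -> ['https://ejde.math.txstate.edu/special-issue-01.html',
#     'https://ejde.math.txstate.edu/special-issue-02.html']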
@@ -361,7 +405,7 @@ authorNum = 0
 articleNum = 0
 
 batch_size = 100  # Number of articles to process before saving
-executor = ThreadPoolExecutor(max_workers=25)  # Set the number of worker threads
+executor = ThreadPoolExecutor(max_workers=len(url_list))  # Set the number of worker threads
 
 # Process each URL using multithreading
 futures = [executor.submit(process_volume, url) for url in url_list]
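With one worker per volume URL, every volume page is fetched concurrently. A minimal sketch of the submit-and-collect pattern used here, assuming the as_completed import already present at the top of the file; the worker body and URLs are stand-ins.

from concurrent.futures import ThreadPoolExecutor, as_completed

def process_volume(url):
    # Stand-in for the real scraper: it only echoes the URL back.
    return url

url_list = ["https://example.org/volume-1", "https://example.org/volume-2"]  # placeholders

executor = ThreadPoolExecutor(max_workers=len(url_list))
futures = [executor.submit(process_volume, url) for url in url_list]

for future in as_completed(futures):
    print("finished:", future.result())   # .result() re-raises any worker exception

executor.shutdown()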
@@ -393,12 +437,12 @@ while failedData:
 if len(articleData) > 0:
     ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
     print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
-    print("Total fetched paper:", len(articleData) + articleNum)
+    print("Total fetched paper:", len(articleData) + articleNum)
 
 if len(authorData) > 0:
     ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
     print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
-    print("Total fetched author:", len(authorData) + authorNum)
+    print("Total fetched author:", len(authorData) + authorNum)
 
 # Save error record
 if len(totallyFailedData) > 0:
@@ -418,4 +462,4 @@ print("time elapsed: {:.2f}s".format(time.time() - start_time))
 
 # Transfer to large file and delete the temporary storage files
 ejde_save.Transf()
-ejde_save.delete()
+ejde_save.delete()