194 lines
5.7 KiB
Python
194 lines
5.7 KiB
Python
import time
|
|
import uuid
|
|
import re
|
|
import urllib
|
|
|
|
from selenium.webdriver.edge.options import Options
|
|
from selenium import webdriver
|
|
from bs4 import BeautifulSoup
|
|
from urllib.parse import urljoin
|
|
|
|
|
|
# Get the information in the webpage through selenium
|
|
def source(driver, num):
    """Wait for the detail page to load and return it as parsed soup.

    Polls for the '#columnRight' element; on success returns a
    BeautifulSoup of the current page source.  Each failed poll sleeps
    3 seconds and bumps *num*; once *num* reaches 5 the driver is quit
    and None is returned.
    """
    while True:
        # Page is considered loaded once the right-hand column exists.
        if driver.find_elements(by='id', value='columnRight'):
            return BeautifulSoup(driver.page_source, 'html.parser')

        if num == 5:
            print('Out of times!')
            driver.quit()  # give up: release the browser
            return None

        num += 1
        time.sleep(3)  # back off before the next poll
|
|
|
|
|
|
# Get the links of the authors' information
|
|
def author_links(Data):
    """Return absolute URLs of every author-detail link found in *Data*.

    *Data* is stringified (it is normally a BeautifulSoup document) and
    scanned for 'person_data' query links, which are then resolved
    against the journal's base URL.

    Returns a list of absolute URL strings (possibly empty).
    """
    # NOTE: both fragments are raw strings — the original second fragment
    # was not raw, making '\d' an invalid escape sequence (SyntaxWarning
    # on Python >= 3.12).
    author_hrefs_pattern = re.compile(
        r'periodica\.html\?periodica=1&'
        r'paramtipus_ertek=person_data&param_ertek=\d+')

    return [
        urljoin('https://www.math.u-szeged.hu/ejqtde/', href)
        for href in author_hrefs_pattern.findall(str(Data))
    ]
|
|
|
|
|
|
# Get the information of the authors
|
|
def author_detail(Data, Year, article_id, Author_list):
    """Parse one author-detail page and append a record to *Author_list*.

    Parameters
    ----------
    Data : BeautifulSoup
        Parsed author-detail page.
    Year : str
        Publication year attached to the affiliation entry.
    article_id : str
        UUID of the article this author belongs to.
    Author_list : list
        Accumulator; mutated in place and also returned.

    Returns the (mutated) *Author_list*.
    """
    # Name: heading text is "First, Middle..., Last"; each comma-separated
    # token has ALL spaces removed (matches the site's formatting).
    names = Data.find('p', class_='publication_head').get_text().split(',')
    names = [part.replace(' ', '') for part in names]

    firstname = names[0]
    lastname = names[-1]
    middlename = ''.join(names[1:-1]) if names[1:-1] else None

    # Info table: cell 0 holds the e-mail anchor, cell 1 the affiliation.
    # (The original built a redundant copy of the td list; dropped.)
    table = Data.find('table', attrs={'border': '1', 'cellpadding': '2px'})
    cells = table.find_all('td')

    affiliation = cells[1].get_text()

    # Some author pages have no e-mail link; guard against the
    # AttributeError the original raised in that case.
    email_anchor = cells[0].find('a')
    email = email_anchor.get('href') if email_anchor is not None else None

    author_data = {
        "author_id": str(uuid.uuid4()),
        "from_article": article_id,
        "firstname": firstname,
        "lastname": lastname,
        "middlename": middlename,
        "affiliation": [
            {
                "year": Year,
                "affiliation": affiliation,
                "email": email
            }
        ]
    }

    Author_list.append(author_data)
    return Author_list
|
|
|
|
|
|
# Get the article's information
|
|
def article_detail(Data, URL, article_id, Aricle_list):
    """Parse an article page and append its metadata to *Aricle_list*.

    Parameters
    ----------
    Data : BeautifulSoup
        Parsed article page.
    URL : str
        Source URL, stored verbatim in the record.
    article_id : str
        Pre-generated UUID for this article.
    Aricle_list : list
        Accumulator; mutated in place and also returned.

    Returns the (mutated) *Aricle_list*.
    """
    data_str = str(Data)  # stringify once; several regexes scan it

    # Title
    font = Data.find('font', attrs={'size': '+1'})
    title = font.find('b').get_text()

    # Authors — the last listed author is the corresponding author.
    author_pattern = re.compile(
        r'periodica\.html\?periodica=1&'
        r'paramtipus_ertek=person_data&param_ertek=\d+"><b>(.*?)</b>')
    authors = author_pattern.findall(data_str)
    corresponding_author = authors[-1]
    authors = authors[:-1]

    # Submit / publish dates ('YYYY-MM-DD' with a zero-padded month).
    def _normalize_date(raw):
        """Drop the leading zero of the month part only.

        The original used strip('0'), which also removed TRAILING zeros
        ('10' became '1'); lstrip keeps '10' intact.
        """
        year, month, day = raw.split('-')
        return year + '-' + month.lstrip('0') + '-' + day

    date_cell = Data.find('td', attrs={'align': 'right', 'width': '50%'})
    dates = re.findall(r'\d+-\d+-\d+', str(date_cell))
    # Guard the indexing: the original crashed with IndexError before its
    # truthiness check whenever fewer than two dates were present.
    submit_date = _normalize_date(dates[0]) if len(dates) > 0 else None
    publish_date = _normalize_date(dates[1]) if len(dates) > 1 else None

    # Keywords
    keyword_tag = Data.find('keywords')
    keyword = keyword_tag.get_text().split(', ') if keyword_tag is not None else None

    # MSC subject codes
    msc_tag = Data.find('subjectcodes')
    msc = msc_tag.get_text().split(', ') if msc_tag is not None else None

    # DOI — run the regex once instead of twice.
    doi_matches = re.findall(r'<a href="(https://doi\.org/.*?)"', data_str)
    doi = doi_matches[0] if doi_matches else None

    publisher = 'www.math.u-szeged.hu/ejqtde'
    journal = 'Electronic Journal of Qualitative Theory of Differential Equations'

    # Volume — first bold number on the page.
    volume = re.findall(r'<b>(\d+)</b>', data_str)[0]

    # Issue and page range, e.g. "12, 1-20".
    result = Data.select_one('body > div:nth-of-type(3) > div:nth-of-type(2)').get_text()
    issue = re.findall(r'(\d+), \d+-\d+', result)[0]
    page = re.findall(r'\d+, (\d+-\d+)', result)[0]

    article_data = {
        "article_id": article_id,
        "title": title,
        "authors": authors,
        "corresponding_authors": corresponding_author,
        "submit_datetime": submit_date,
        "publish_datetime": publish_date,
        "keywords": keyword,
        "MSC": msc,
        "URL": URL,
        "DOI": doi,
        "publisher": publisher,
        "journal": journal,
        "volume": volume,
        "issue": issue,
        "page": page,
    }

    Aricle_list.append(article_data)
    return Aricle_list
|
|
|
|
|
|
# Main code of scrawler
|
|
def scrawler(URL, lock, Article_list, Author_list):
    """Crawl one article URL: collect article metadata, then each author.

    Parameters
    ----------
    URL : str
        Article detail-page URL.
    lock : object
        Unused here; kept for the caller's multiprocessing interface.
    Article_list, Author_list : list
        Shared accumulators, mutated in place.
    """
    print('Start: ', URL)
    driver = webdriver.Edge(options=options)
    driver.get(URL)

    Max_retryTimes = 3
    essay_data = source(driver, Max_retryTimes)

    # Guard clause: source() already quit the driver after exhausting
    # its retries, so only report and bail out.
    if essay_data is None:
        print('Wrong: Some error occurred: ', URL)
        return

    article_id = str(uuid.uuid4())
    Article_list = article_detail(essay_data, URL, article_id, Article_list)

    # Publication year is the first bold number on the article page.
    Year = re.findall(r'<b>(\d+)</b>', str(essay_data))[0]

    for author_link in author_links(essay_data):
        driver.get(author_link)
        author_page = source(driver, Max_retryTimes)
        if author_page is None:
            # Driver was quit inside source(); the original crashed here
            # with AttributeError on author_detail(None, ...).
            print('Wrong: Some error occurred: ', author_link)
            return
        Author_list = author_detail(author_page, Year, article_id, Author_list)

    print('Complete: ', URL)
    driver.quit()
|
|
|
|
|
|
# Options setting
|
|
# Edge options shared by every scrawler() call.
options = Options()
options.add_argument('--headless')     # run Edge without a visible window
options.add_argument('--disable-gpu')  # Chromium flags need the '--' prefix
# Page load strategy is a WebDriver capability, not a browser CLI flag;
# passing it via add_argument() (as the original did) silently does
# nothing.  Selenium 4 exposes it as an options attribute instead.
options.page_load_strategy = 'none'
|
|
|
|
|