Change the data structure

parent e504e73409
commit 27707a058c
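
The scraped records now use a consistent snake_case schema across the three crawlers: firstname/lastname/middlename become first_name/last_name/middle_name, from_article becomes a list of article ids instead of a bare id in the record builders updated below, and in the main crawler middle_name is pre-joined into a single string rather than kept as a list of parts. Fields that can be absent (msc, keywords, MSC) now default to [] instead of None. The main script additionally waits on all volume futures before starting the retry loop, and the temporary storage files are deleted after the transfer.

A minimal sketch of the resulting author record, as assembled in the diff below (the literal values are illustrative; article_id, volume, and affiliation come from the surrounding scraping code):

    import uuid

    name = "John Q Public".split(" ")
    # Middle names collapse to one string, or None when absent.
    middle_name = ''.join(name[1:-1]) if name[1:-1] else None

    author_data = {
        "author_id": str(uuid.uuid4()),
        "from_article": ["<article_id>"],  # now a list, so later articles can be appended
        "first_name": name[0],             # "John"
        "last_name": name[-1],             # "Public"
        "middle_name": middle_name,        # "Q"
        "affiliation": [{
            "year": "<volume>",
            "affiliation": "<affiliation>",
        }],
    }

Defaulting empty msc/keywords fields to [] lets downstream consumers iterate or extend them without None checks. The SpringerOpen date helper is also rewritten on top of datetime instead of indexing calendar.month_name; its new behavior, for reference:

    from datetime import datetime

    def timeSet(time):
        # e.g. "21 March 2023" -> "2023-03-21"
        input_date = datetime.strptime(time, "%d %B %Y")
        return input_date.strftime("%Y-%m-%d")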
@@ -7,7 +7,7 @@ import ejde_save
 from retrying import retry
 from datetime import datetime
 from bs4 import BeautifulSoup
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from concurrent.futures import ThreadPoolExecutor, as_completed, wait
 
 '''
 Crawled website: 'ejde.math.txstate.edu'
@@ -184,7 +184,7 @@ def process_article(title, article_url):
         msc = msc_match.group(1).strip().strip('.').strip()
         msc = re.split(r', |;', msc)
     else:
-        msc = None
+        msc = []
 
     # Extract KeyWords
     keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
@@ -195,7 +195,7 @@ def process_article(title, article_url):
         keywords = re.split(r', |;', keywords)
         keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords]
     else:
-        keywords = None
+        keywords = []
 
     # Extract DOI
     doi_match = re.search(r'DOI: (.+)(?=<)', html)
@@ -220,6 +220,7 @@ def process_article(title, article_url):
         # Data processing
         authors.append(cell[0])
         name = cell[0].split(" ")
+        middle_name = ''.join(name[1:-1]) if name[1:-1] else None
         affiliation = ', '.join(cell[1:-1])
         affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
         email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1])
@@ -227,10 +228,10 @@ def process_article(title, article_url):
 
         author_data = {
             "author_id": str(uuid.uuid4()),
-            "from_article": article_id,
-            "firstname": name[0],
-            "lastname": name[-1],
-            "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(name) > 2 else None,
+            "from_article": [article_id],
+            "first_name": name[0],
+            "last_name": name[-1],
+            "middle_name": middle_name,
             "affiliation": [{
                 "year": volume,
                 "affiliation": affiliation,
@@ -256,6 +257,7 @@ def process_article(title, article_url):
         # Data processing
         authors.append(match[0])
         name = match[0].split(" ")
+        middle_name = ''.join(name[1:-1]) if name[1:-1] else None
         affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip()
         affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
         email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1])
@@ -263,10 +265,10 @@ def process_article(title, article_url):
 
         author_data = {
             "author_id": str(uuid.uuid4()),
-            "from_article": article_id,
-            "firstname": name[0],
-            "lastname": name[-1],
-            "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(name) > 2 else None,
+            "from_article": [article_id],
+            "first_name": name[0],
+            "last_name": name[-1],
+            "middle_name": middle_name,
             "affiliation": [{
                 "year": volume,
                 "affiliation": affiliation,
@@ -345,6 +347,8 @@ for future in as_completed(futures):
     except Exception as vol_err:
         print("VOLUME PROCESSING ERROR:", str(vol_err))
 
+wait(futures)
+
 # Retry failed processing paper
 print("START RETRYING:", len(failedData))
 while failedData:
@@ -390,4 +394,4 @@ print("time elapsed: {:.2f}s".format(time.time() - start_time))
 
 # Transfer to large file and delete the temporary storage files
 ejde_save.Transf()
-# ejde_save.delete()
+ejde_save.delete()
@@ -63,10 +63,10 @@ def author_detail(Data, Year, article_id, Author_list):
 
     author_data = {
        "author_id": str(uuid.uuid4()),
-        "from_article": article_id,
-        "firstname": Firstname,
-        "lastname": Lastname,
-        "middlename": Middlename,
+        "from_article": [article_id],
+        "first_name": Firstname,
+        "last_name": Lastname,
+        "middle_name": Middlename,
         "affiliation": [
             {
                 "year": Year,
@@ -94,24 +94,18 @@ def article_detail(Data, URL, article_id, Aricle_list):
     del Author[-1]
 
     # Submit_datetime and publish_datetime
-    def timeSet(time):
-        time = time.split('-')
-        time[1] = time[1].strip('0')
-        time = time[0] + '-' + time[1] + '-' + time[2]
-        return time
-
     time = Data.find('td', attrs={'align': 'right', 'width': '50%'})
     time = re.findall(r'\d+-\d+-\d+', str(time))
-    Submit_date = timeSet(time[0]) if time[0] else None
-    Publish_date = timeSet(time[1]) if time[1] else None
+    Submit_date = time[0] if time[0] else None
+    Publish_date = time[1] if time[1] else None
 
     # Keyword
     Keyword = Data.find('keywords').get_text() if Data.find('keywords') is not None else None
-    Keyword = Keyword.split(', ') if Keyword is not None else None
+    Keyword = Keyword.split(', ') if Keyword is not None else []
 
     # MSC
     MSC = Data.find('subjectcodes').get_text() if Data.find('subjectcodes') is not None else None
-    MSC = MSC.split(', ') if MSC is not None else None
+    MSC = MSC.split(', ') if MSC is not None else []
 
     # DOI
     if len(re.findall(r'<a href="(https://doi\.org/.*?)"', str(Data))) > 0:
@@ -1,5 +1,6 @@
 import uuid
 import calendar
+from datetime import datetime
 
 
 # ========== Fetch details ==========
@@ -36,9 +37,9 @@ def Author_dict(soup, article_id, Author_list):
     author_data = {
         "author_id": str(uuid.uuid4()),
         "from_article": article_id,
-        "firstname": Firstname,
-        "lastname": Lastname,
-        "middlename": Middlename,
+        "first_name": Firstname,
+        "last_name": Lastname,
+        "middle_name": Middlename,
         "affiliation": [
             {
                 "year": Year,
@@ -87,10 +88,8 @@ def Article_dict(soup, url, article_id):
     Time = []
 
     def timeSet(time):
-        time = time.split(' ')
-        time[1] = str(list(calendar.month_name).index(time[1]))
-        time = time[2] + '-' + time[1] + '-' + time[0]
-        return time
+        input_date = datetime.strptime(time, "%d %B %Y")
+        return input_date.strftime("%Y-%m-%d")
 
     time_list = info.find('ul', class_='c-bibliographic-information__list')
     times = time_list.find_all('time')
@@ -112,7 +111,7 @@ def Article_dict(soup, url, article_id):
         Keyword.append(keyword)
 
     # MSC
-    MSC = None  # SpringerOpen.com does not have MSC
+    MSC = []  # SpringerOpen.com does not have MSC
 
     # DOI
     DOI = info.find('li', class_='c-bibliographic-information__list-item c-bibliographic-information__list-item--doi')