Removed the unnecessary `retrying` import and `@retry` decorator
This commit is contained in:
parent
e49e829682
commit
e9bdb9cdff
@ -4,9 +4,8 @@ import re
|
|||||||
import ejde_save
|
import ejde_save
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed, wait
|
|
||||||
from retrying import retry
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
|
||||||
'''
|
'''
|
||||||
爬取网站:'ejde.math.txstate.edu'
|
爬取网站:'ejde.math.txstate.edu'
|
||||||
@ -24,7 +23,6 @@ def datetime_transform(date):
|
|||||||
|
|
||||||
|
|
||||||
# Article and author detail
|
# Article and author detail
|
||||||
@retry(wait_fixed=5000, stop_max_attempt_number=5)
|
|
||||||
def process_article(url):
|
def process_article(url):
|
||||||
response = requests.get(url)
|
response = requests.get(url)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
@ -61,13 +59,19 @@ def process_article(url):
|
|||||||
|
|
||||||
# Extract submission date
|
# Extract submission date
|
||||||
match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
|
match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
|
||||||
submitted_date = match.group(1) if match else None
|
submitted_date = match.group(1)
|
||||||
submitted_date = datetime_transform(submitted_date)
|
if match:
|
||||||
|
submitted_date = datetime_transform(submitted_date)
|
||||||
|
else:
|
||||||
|
submitted_date = None
|
||||||
|
|
||||||
# Extract publication date
|
# Extract publication date
|
||||||
match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
|
match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
|
||||||
publish_date = match.group(1) if match else None
|
publish_date = match.group(1)
|
||||||
publish_date = datetime_transform(publish_date)
|
if match:
|
||||||
|
publish_date = datetime_transform(publish_date)
|
||||||
|
else:
|
||||||
|
publish_date = None
|
||||||
|
|
||||||
# Extract MSC
|
# Extract MSC
|
||||||
msc_match = re.search(r'Math Subject Classifications: (.*?)\n', html)
|
msc_match = re.search(r'Math Subject Classifications: (.*?)\n', html)
|
||||||
@ -182,8 +186,6 @@ for future in as_completed(futures):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("An error occurred:", str(e))
|
print("An error occurred:", str(e))
|
||||||
|
|
||||||
wait(futures)
|
|
||||||
|
|
||||||
# Save remaining data
|
# Save remaining data
|
||||||
if len(articleData) > 0:
|
if len(articleData) > 0:
|
||||||
ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
|
ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
|
||||||
|
|||||||
@ -93,5 +93,6 @@ def delete():
|
|||||||
file_path = os.path.join(folder_path, file_name)
|
file_path = os.path.join(folder_path, file_name)
|
||||||
if os.path.isfile(file_path):
|
if os.path.isfile(file_path):
|
||||||
os.remove(file_path)
|
os.remove(file_path)
|
||||||
|
os.rmdir(folder_path)
|
||||||
|
|
||||||
print('\nAttention: The temporary storage files have been deleted!')
|
print('\nAttention: The temporary storage files have been deleted!')
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user