ImageOptimizer/crawler.py
2025-06-09 16:47:55 +08:00

199 lines
7.6 KiB
Python

import os
import requests
import time
import customtkinter as ctk
from tkinter import messagebox, StringVar
# Unsplash API
api_url = "https://api.unsplash.com/search/photos"
default_access_key = ['-7mzLi8x261zMRKU-Rw_Qz_vIvaYvf2sWHV1l1pO5ZE',
'y0TRzwdfxA4PpZBPw3Lr8a6bYyAugiKItS7b_80UTA4']
# Crawl parameter
categories = {
'animals': 'animals',
'people': 'people',
'food-drink': 'food drink',
'nature': 'nature',
'architecture-interior': 'architecture interior'
}
# Save directory and changelog file
os.makedirs('dataset', exist_ok=True)
changelog_path = os.path.join('dataset', 'changelog.txt')
# Initialize page and images_downloaded
category_state = {category: {'page': 1, 'images_downloaded': 0} for category in categories}
# Check if changelog file exists and read the state if it does
if os.path.exists(changelog_path):
with open(changelog_path, 'r') as changelog_file:
for line in changelog_file:
category, page, images_downloaded = line.strip().split(',')
category_state[category]['page'] = int(page)
category_state[category]['images_downloaded'] = int(images_downloaded)
# Create changelog file
else:
with open(changelog_path, 'w') as changelog_file:
for category in categories:
changelog_file.write(f'{category},1,0\n')
# Create directories for categories if they don't exist
for category in categories:
os.makedirs(os.path.join('dataset', category), exist_ok=True)
# Crawling
def crawl_images(category, query, download_num, access_key):
if download_num == 0:
print("No need to update: " + category)
return
global category_state
img_index = category_state[category]['images_downloaded']
cur_page = category_state[category]['page']
flag = 0
if img_index % 10 != 0:
cur_page -= 1
real_img_index = int((cur_page - 1) * 10)
try:
while download_num > 0:
params = {
'query': query,
'client_id': access_key,
'per_page': 50,
'page': cur_page
}
response = requests.get(api_url, params=params)
if response.status_code == 200:
data = response.json()
images = data['results']
print(f'Found {len(images)} images for category {category} on page {cur_page}')
for idx, img in enumerate(images):
if download_num <= 0:
break
img_url = img['urls']['regular']
# Ignore crawled images
if flag == 0 and img_index > real_img_index:
real_img_index += 1
print(img_index)
print(real_img_index)
continue
else:
flag = 1
# Crawls and Store images
try:
img_data = requests.get(img_url).content
img_filename = os.path.join('dataset', category, f'{category}_{img_index + 1}.jpg')
with open(img_filename, 'wb') as handler:
handler.write(img_data)
print(f'Downloaded {img_filename}')
img_index += 1
download_num -= 1
except Exception as e:
messagebox.showerror("Download Error", f'Failed to download {img_url}: {e}')
break
cur_page += 1
time.sleep(1)
else:
messagebox.showerror("Download Error",
f'Failed to fetch images for category {category}: {response.status_code}. '
f'It can be caused by the API limit. Please try again after 1hr.')
break
finally:
# Update changelog file after finishing each category
category_state[category]['page'] = cur_page
category_state[category]['images_downloaded'] = img_index
with open(changelog_path, 'w') as new_changelog_file:
for cat in categories:
new_changelog_file.write(
f'{cat},{category_state[cat]["page"]},{category_state[cat]["images_downloaded"]}\n')
print("Completed: " + category)
def start_crawling():
start_button.configure(text="Processing", fg_color="red")
start_button.update()
access_key = access_key_combo.get()
try:
total_images_to_download = {
'animals': int(animals_var.get()),
'people': int(people_var.get()),
'food-drink': int(food_drink_var.get()),
'nature': int(nature_var.get()),
'architecture-interior': int(architecture_interior_var.get())
}
# Validate input values
total_count = sum(total_images_to_download.values())
if total_count <= 0:
messagebox.showerror("Invalid input", "Total number of images to download must be greater than 0.")
return
if total_count > 500:
messagebox.showerror("Invalid input", "The total number of images to download must not exceed 500.")
return
# Start Crawling
for curr_category, curr_query in categories.items():
print(curr_category)
crawl_images(curr_category, curr_query, total_images_to_download[curr_category], access_key)
except ValueError:
messagebox.showerror("Invalid input", "Please enter valid integer values.")
return
finally:
start_button.configure(text="Start Crawling", fg_color="#3b8ed0")
start_button.update()
if __name__ == "__main__":
# Create the main window
root = ctk.CTk()
root.title("Image Crawler")
root.iconbitmap('./icon/icon.ico')
# Create StringVar instances for each entry
animals_var = StringVar(value="0")
people_var = StringVar(value="0")
food_drink_var = StringVar(value="0")
nature_var = StringVar(value="0")
architecture_interior_var = StringVar(value="0")
# Create and place widgets
ctk.CTkLabel(root, text="Unsplash API Access Key:").grid(row=0, column=0, padx=10, pady=5)
access_key_combo = ctk.CTkComboBox(root, values=default_access_key, width=200)
access_key_combo.grid(row=0, column=1, padx=10, pady=5)
ctk.CTkLabel(root, text="Animals:").grid(row=1, column=0, padx=10, pady=5)
animals_entry = ctk.CTkEntry(root, textvariable=animals_var, width=100)
animals_entry.grid(row=1, column=1, padx=10, pady=5)
ctk.CTkLabel(root, text="People:").grid(row=2, column=0, padx=10, pady=5)
people_entry = ctk.CTkEntry(root, textvariable=people_var, width=100)
people_entry.grid(row=2, column=1, padx=10, pady=5)
ctk.CTkLabel(root, text="Food & Drink:").grid(row=3, column=0, padx=10, pady=5)
food_drink_entry = ctk.CTkEntry(root, textvariable=food_drink_var, width=100)
food_drink_entry.grid(row=3, column=1, padx=10, pady=5)
ctk.CTkLabel(root, text="Nature:").grid(row=4, column=0, padx=10, pady=5)
nature_entry = ctk.CTkEntry(root, textvariable=nature_var, width=100)
nature_entry.grid(row=4, column=1, padx=10, pady=5)
ctk.CTkLabel(root, text="Architecture & Interior:").grid(row=5, column=0, padx=10, pady=5)
architecture_interior_entry = ctk.CTkEntry(root, textvariable=architecture_interior_var, width=100)
architecture_interior_entry.grid(row=5, column=1, padx=10, pady=5)
start_button = ctk.CTkButton(root, text="Start Crawling", command=start_crawling)
start_button.grid(row=6, column=0, columnspan=2, padx=10, pady=20)
# Start Window
root.mainloop()