ImageOptimizer/dataset_analysis.py
2025-06-09 16:47:55 +08:00

75 lines
2.6 KiB
Python

import pandas as pd
import json
def _read_json_objects(file_path):
    """Collect the JSON objects stored one after another in *file_path*.

    The file is scanned line by line. A line starting with ``{`` opens a new
    object (discarding any unfinished text buffered so far), and a line ending
    with ``}`` triggers a parse of everything buffered since the last opening
    line. Objects written on a single line are therefore parsed as well.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).

    Returns:
        A list with the parsed value of every chunk that decoded successfully,
        in file order.
    """
    json_objects = []
    # Stripped lines belonging to the object currently being assembled.
    pending = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            # Remove whitespace at the beginning and end of the line
            line = raw_line.strip()
            opens_object = line.startswith("{")
            if opens_object:
                # A new JSON object starts: drop whatever was buffered before
                pending = [line]
            else:
                pending.append(line)
            if not line.endswith("}"):
                continue
            try:
                json_obj = json.loads("".join(pending))
            except json.JSONDecodeError as e:
                if opens_object:
                    # The opening line merely happens to end with '}' (for
                    # example an empty nested object); keep accumulating.
                    continue
                print(f"Error decoding JSON: {e}")
            else:
                json_objects.append(json_obj)
            pending = []
    return json_objects


def _column_stats(column, data):
    """Build the descriptive-statistics row for one numeric column.

    Args:
        column: Column label, stored under the ``'Parameter'`` key.
        data: pandas Series holding the column values with NaN already removed.

    Returns:
        A dict mapping each statistic name to its value.
    """
    lowest = data.min()
    highest = data.max()
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    modes = data.mode()
    return {
        'Parameter': column,
        'Min': lowest,
        'Max': highest,
        'Mean': data.mean(),
        'Median': data.median(),
        # mode() may return several values; the first one is reported
        'Mode': modes.values[0] if not modes.empty else float('nan'),
        'Range': highest - lowest,
        'Q1': q1,
        'Q3': q3,
        'IQR': q3 - q1,
        'Variance': data.var(),
        'Standard Deviation': data.std(),
        'Skewness': data.skew(),
    }


def dataset_desc(file_path):
    """Load JSON objects from a file and summarise their numeric fields.

    Args:
        file_path: Path of a text file containing JSON objects, each starting
            on a line that begins with ``{`` and finishing on a line that ends
            with ``}``.

    Returns:
        A tuple ``(stats_df, df)``. ``df`` is a DataFrame with one row per
        parsed object. ``stats_df`` has one row per numeric column of ``df``
        with the columns Parameter, Min, Max, Mean, Median, Mode, Range, Q1,
        Q3, IQR, Variance, Standard Deviation and Skewness; it is empty (but
        keeps those columns) when ``df`` has no numeric column.
    """
    # Convert the list of JSON objects to a pandas DataFrame
    df = pd.DataFrame(_read_json_objects(file_path))
    # Keep only the numeric columns
    numeric_df = df.select_dtypes(include='number')
    stat_columns = [
        'Parameter', 'Min', 'Max', 'Mean', 'Median', 'Mode', 'Range',
        'Q1', 'Q3', 'IQR', 'Variance', 'Standard Deviation', 'Skewness',
    ]
    # Build all rows first and create the frame once, instead of growing it
    # with repeated concatenation.
    rows = [
        _column_stats(column, numeric_df[column].dropna())
        for column in numeric_df.columns
    ]
    stats_df = pd.DataFrame(rows, columns=stat_columns)
    return stats_df, df