From 040de4c25b8fe6a9f9d07d31fff71515a18b4b3b Mon Sep 17 00:00:00 2001 From: Creylay Date: Mon, 20 Apr 2026 16:16:17 -0400 Subject: [PATCH 001/361] feat: integrate image classification components and dataloaders - Added ImageDataLoader for loading image datasets from zip files. - Implemented MLPImageClassifier for image classification using a multi-layer perceptron. - Created ImageClassificationTask to handle image classification tasks. - Updated DatasetModal to skip preview for ImageDataLoader. - Added tests for MLPImageClassifier and image classification pipeline. --- DashAI/back/api/api_v1/endpoints/datasets.py | 34 ++ .../dataloaders/classes/dashai_dataset.py | 22 +- .../dataloaders/classes/image_dataloader.py | 116 +++++++ DashAI/back/initial_components.py | 10 +- DashAI/back/models/mlp_image_classifier.py | 307 ++++++++++++++++++ .../back/tasks/image_classification_task.py | 90 +++++ .../src/components/datasets/DatasetModal.jsx | 16 +- .../back/models/test_mlp_image_classifier.py | 149 +++++++++ 8 files changed, 736 insertions(+), 8 deletions(-) create mode 100644 DashAI/back/dataloaders/classes/image_dataloader.py create mode 100644 DashAI/back/models/mlp_image_classifier.py create mode 100644 DashAI/back/tasks/image_classification_task.py create mode 100644 tests/back/models/test_mlp_image_classifier.py diff --git a/DashAI/back/api/api_v1/endpoints/datasets.py b/DashAI/back/api/api_v1/endpoints/datasets.py index 5ea732ac6..55d943a06 100644 --- a/DashAI/back/api/api_v1/endpoints/datasets.py +++ b/DashAI/back/api/api_v1/endpoints/datasets.py @@ -1638,6 +1638,15 @@ async def preview_with_types( with zipfile.ZipFile(tmp_file_path, "r") as zf: zf.extractall(extract_dir) + image_extensions = { + ".png", + ".jpg", + ".jpeg", + ".bmp", + ".gif", + ".tiff", + ".webp", + } supported_map = { ".csv": "CSVDataLoader", ".json": "JSONDataLoader", @@ -1646,6 +1655,7 @@ async def preview_with_types( } dataloader_name = None matched_file = None + has_images = False for root, 
_, files in os.walk(extract_dir): for f in files: ext = os.path.splitext(f)[1].lower() @@ -1653,9 +1663,33 @@ async def preview_with_types( dataloader_name = supported_map[ext] matched_file = os.path.join(root, f) break + if ext in image_extensions: + has_images = True if dataloader_name: break + if dataloader_name is None and has_images: + shutil.rmtree(extract_dir, ignore_errors=True) + os.unlink(tmp_file_path) + return { + "sample": [], + "schema": { + "image": {"type": "Image", "dtype": "string"}, + "label": { + "type": "Categorical", + "dtype": "string", + }, + }, + "inferred_types": { + "image": {"type": "Image", "dtype": "string"}, + "label": { + "type": "Categorical", + "dtype": "string", + }, + }, + "preview_row_count": 0, + } + if dataloader_name is None: raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, diff --git a/DashAI/back/dataloaders/classes/dashai_dataset.py b/DashAI/back/dataloaders/classes/dashai_dataset.py index ccb69c4a6..571b7f3ed 100644 --- a/DashAI/back/dataloaders/classes/dashai_dataset.py +++ b/DashAI/back/dataloaders/classes/dashai_dataset.py @@ -240,11 +240,23 @@ def _compute_general_info(self, dataset_df) -> dict: dict General information including rows, columns, memory usage, and dtypes. 
""" + from DashAI.back.types.dashai_image import DashAIImage + + hashable_cols = [ + c + for c in dataset_df.columns + if not isinstance(self.types.get(c), DashAIImage) + ] + if hashable_cols: + duplicate_rows = int(dataset_df[hashable_cols].duplicated().sum()) + else: + duplicate_rows = 0 + return { "n_rows": len(dataset_df), "n_columns": len(dataset_df.columns), "memory_usage_mb": float(dataset_df.memory_usage(deep=True).sum() / 1e6), - "duplicate_rows": int(dataset_df.duplicated().sum()), + "duplicate_rows": duplicate_rows, "dtypes": {k: v.to_string().get("type") for k, v in self.types.items()}, } @@ -757,8 +769,12 @@ def transform_dataset_with_schema( dai_table[column_name] = base_col # Use the dtype from schema for pa_type pa_type = to_arrow_types(dtype) - # DashAIImage is currently not fully implemented - # This step should be formalized after solving that. + elif _type == "Image": + from DashAI.back.types.dashai_image import DashAIImage + + dashai_types[column_name] = DashAIImage(dtype=dtype) + dai_table[column_name] = table.column(column_name) + pa_type = table.schema.field(column_name).type else: if _type in ["Date", "Time", "Timestamp"]: # Since DashAI is not using date, time or timestamp types for its models diff --git a/DashAI/back/dataloaders/classes/image_dataloader.py b/DashAI/back/dataloaders/classes/image_dataloader.py new file mode 100644 index 000000000..beb229f60 --- /dev/null +++ b/DashAI/back/dataloaders/classes/image_dataloader.py @@ -0,0 +1,116 @@ +"""DashAI Image Dataloader.""" + +import shutil +from typing import Any, Dict + +from beartype import beartype +from datasets import Dataset, IterableDatasetDict, load_dataset + +from DashAI.back.core.schema_fields import none_type, schema_field, string_field +from DashAI.back.core.schema_fields.base_schema import BaseSchema +from DashAI.back.core.utils import MultilingualString +from DashAI.back.dataloaders.classes.dashai_dataset import ( + DashAIDataset, + to_dashai_dataset, +) +from 
DashAI.back.dataloaders.classes.dataloader import BaseDataLoader + + +class ImageDataLoaderSchema(BaseSchema): + name: schema_field( + none_type(string_field()), + "", + ( + "Custom name to register your dataset. If no name is specified, " + "the name of the uploaded file will be used." + ), + ) # type: ignore + + +class ImageDataLoader(BaseDataLoader): + """Data loader for image datasets. + + Expects a zip file containing images organized in subdirectories by class + label (imagefolder format). + """ + + COMPATIBLE_COMPONENTS = ["ImageClassificationTask"] + SCHEMA = ImageDataLoaderSchema + + DESCRIPTION: str = MultilingualString( + en=( + "Data loader for image datasets. Upload a ZIP file containing " + "images organized in subdirectories by class label " + "(imagefolder format)." + ), + es=( + "Cargador de datos para datasets de imágenes. Suba un archivo " + "ZIP con imágenes organizadas en subdirectorios por etiqueta " + "de clase (formato imagefolder)." + ), + ) + DISPLAY_NAME: str = MultilingualString( + en="Image Data Loader", + es="Cargador de Datos de Imágenes", + ) + + @beartype + def load_data( + self, + filepath_or_buffer: str, + temp_path: str, + params: Dict[str, Any], + n_sample: int | None = None, + ) -> DashAIDataset: + """Load an image dataset. + + Parameters + ---------- + filepath_or_buffer : str + An URL where the dataset is located or a FastAPI/Uvicorn uploaded + file object. + temp_path : str + The temporary path where the files will be extracted and then + uploaded. + params : Dict[str, Any] + Dict with the dataloader parameters. + n_sample : int | None + Indicates how many rows to load from the dataset, all rows if None. + + Returns + ------- + DashAIDataset + A DashAI Dataset with the loaded image data. + """ + import io + + prepared_path = self.prepare_files(filepath_or_buffer, temp_path) + + if prepared_path[1] != "dir": + raise ValueError( + "The image dataloader requires the input file to be a zip file." 
+ ) + + dataset = load_dataset( + "imagefolder", + data_dir=prepared_path[0], + streaming=bool(n_sample), + cache_dir=temp_path, + ) + + if n_sample: + if isinstance(dataset, IterableDatasetDict): + dataset = dataset["train"] + dataset = Dataset.from_list(list(dataset.take(n_sample))) + + def convert_image_to_bytes(example): + buffer = io.BytesIO() + img_format = example["image"].format or "PNG" + example["image"].save(buffer, format=img_format) + return {"image": {"bytes": buffer.getvalue(), "format": img_format}} + + dataset = dataset.map(convert_image_to_bytes) + + shutil.rmtree(prepared_path[0]) + + return to_dashai_dataset(dataset) diff --git a/DashAI/back/initial_components.py b/DashAI/back/initial_components.py index 0a01fb093..8ddfcfdd6 100644 --- a/DashAI/back/initial_components.py +++ b/DashAI/back/initial_components.py @@ -64,6 +64,7 @@ # DataLoaders from DashAI.back.dataloaders.classes.csv_dataloader import CSVDataLoader from DashAI.back.dataloaders.classes.excel_dataloader import ExcelDataLoader +from DashAI.back.dataloaders.classes.image_dataloader import ImageDataLoader from DashAI.back.dataloaders.classes.json_dataloader import JSONDataLoader # Explainers @@ -121,8 +122,6 @@ from DashAI.back.metrics.translation.bleu import Bleu from DashAI.back.metrics.translation.chrf import Chrf from DashAI.back.metrics.translation.ter import Ter - -# Models from DashAI.back.models.hugging_face.deberta_v3_transformer import DebertaV3Transformer from DashAI.back.models.hugging_face.distilbert_transformer import DistilBertTransformer from DashAI.back.models.hugging_face.llama_model import LlamaModel @@ -165,6 +164,9 @@ StableDiffusionXLModel, ) from DashAI.back.models.hugging_face.tongyi_z_image_model import TongyiZImageModel + +# Models +from DashAI.back.models.mlp_image_classifier import MLPImageClassifier from DashAI.back.models.scikit_learn.bow_text_classification_model import ( BagOfWordsTextClassificationModel, ) @@ -208,6 +210,7 @@ # Tasks from 
DashAI.back.tasks.controlnet_task import ControlNetTask +from DashAI.back.tasks.image_classification_task import ImageClassificationTask from DashAI.back.tasks.regression_task import RegressionTask from DashAI.back.tasks.tabular_classification_task import TabularClassificationTask from DashAI.back.tasks.text_classification_task import TextClassificationTask @@ -239,6 +242,7 @@ def get_initial_components(): TextToImageGenerationTask, TextToTextGenerationTask, ControlNetTask, + ImageClassificationTask, # Models SVC, DecisionTreeClassifier, @@ -276,10 +280,12 @@ def get_initial_components(): RidgeRegression, LinearSVR, LinearRegression, + MLPImageClassifier, # Dataloaders CSVDataLoader, JSONDataLoader, ExcelDataLoader, + ImageDataLoader, # Metrics F1, Accuracy, diff --git a/DashAI/back/models/mlp_image_classifier.py b/DashAI/back/models/mlp_image_classifier.py new file mode 100644 index 000000000..3e12427d8 --- /dev/null +++ b/DashAI/back/models/mlp_image_classifier.py @@ -0,0 +1,307 @@ +"""MLP-based image classifier for DashAI.""" + +import datasets +import torch +import torch.nn as nn +import torch.optim as optim +import torch.utils.data +from torchvision import transforms + +from DashAI.back.core.schema_fields import ( + BaseSchema, + float_field, + int_field, + list_field, + schema_field, +) +from DashAI.back.core.utils import MultilingualString +from DashAI.back.models.base_model import BaseModel + + +class MLPImageClassifierSchema(BaseSchema): + """Configuration parameters for the MLP Image Classifier.""" + + epochs: schema_field( + int_field(ge=1), + placeholder=10, + description=MultilingualString( + en=( + "The number of epochs to train the model. An epoch is a full " + "iteration over the training data." + ), + es=( + "El número de épocas para entrenar el modelo. Una época es una " + "iteración completa sobre los datos de entrenamiento." 
+ ), + ), + alias=MultilingualString(en="Epochs", es="Épocas"), + ) # type: ignore + + learning_rate: schema_field( + float_field(gt=0.0), + placeholder=0.001, + description=MultilingualString( + en="Learning rate for the Adam optimizer.", + es="Tasa de aprendizaje para el optimizador Adam.", + ), + alias=MultilingualString(en="Learning rate", es="Tasa de aprendizaje"), + ) # type: ignore + + hidden_dims: schema_field( + list_field(int_field(ge=1), min_items=1), + placeholder=[128, 64], + description=MultilingualString( + en=( + "The hidden layers and their dimensions. Specify the number " + "of units of each layer separated by commas." + ), + es=( + "Las capas ocultas y sus dimensiones. Especifique el número " + "de unidades de cada capa separadas por comas." + ), + ), + alias=MultilingualString( + en="Hidden layer dimensions", + es="Dimensiones de capas ocultas", + ), + ) # type: ignore + + +class _ImageDataset(torch.utils.data.Dataset): + """Torch Dataset wrapper for HuggingFace image datasets.""" + + def __init__(self, hf_dataset: datasets.Dataset, has_labels: bool = True): + self.dataset = hf_dataset + self.has_labels = has_labels + self.transforms = transforms.Compose( + [ + transforms.Resize((30, 30)), + transforms.ToTensor(), + ] + ) + + column_names = list(self.dataset.features.keys()) + self.image_col_name = column_names[0] + self.label_col_name = ( + column_names[1] if has_labels and len(column_names) > 1 else None + ) + + self.tensor_shape = self.transforms(self.dataset[0][self.image_col_name]).shape + + def num_classes(self): + if self.label_col_name is None: + return 0 + return len(set(self.dataset[self.label_col_name])) + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, idx): + image = self.transforms(self.dataset[idx][self.image_col_name]) + if self.label_col_name is None: + return image + label = self.dataset[idx][self.label_col_name] + return image, label + + +class _MLP(nn.Module): + """Multi-Layer Perceptron for image 
classification.""" + + def __init__(self, input_dim, output_dim, hidden_dims): + super().__init__() + self.hidden_layers = nn.ModuleList() + previous_dim = input_dim + + for hidden_dim in hidden_dims: + self.hidden_layers.append(nn.Linear(previous_dim, hidden_dim)) + previous_dim = hidden_dim + + self.output_layer = nn.Linear(previous_dim, output_dim) + self.relu = nn.ReLU() + + def forward(self, x: torch.Tensor): + batch_size = x.shape[0] + x = x.view(batch_size, -1) + + for layer in self.hidden_layers: + x = self.relu(layer(x)) + + return self.output_layer(x) + + +class MLPImageClassifier(BaseModel): + """MLP-based image classifier. + + A feed-forward neural network that flattens image pixels and passes them + through configurable hidden layers with ReLU activation for classification. + """ + + SCHEMA = MLPImageClassifierSchema + COMPATIBLE_COMPONENTS = ["ImageClassificationTask"] + DISPLAY_NAME: str = MultilingualString( + en="MLP Image Classifier", + es="Clasificador de Imágenes MLP", + ) + DESCRIPTION: str = MultilingualString( + en=( + "A Multi-Layer Perceptron (MLP) image classifier that flattens " + "image pixels and passes them through configurable fully-connected " + "hidden layers with ReLU activation for classification." + ), + es=( + "Un clasificador de imágenes basado en Perceptrón Multicapa (MLP) " + "que aplana los píxeles de la imagen y los pasa por capas ocultas " + "completamente conectadas con activación ReLU para clasificación." 
+ ), + ) + COLOR: str = "#E91E63" + ICON: str = "Image" + + def __init__(self, epochs=10, learning_rate=0.001, hidden_dims=None, **kwargs): + if hidden_dims is None: + hidden_dims = [128, 64] + self.epochs = epochs + self.learning_rate = learning_rate + self.hidden_dims = hidden_dims + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.model = None + self.optimizer = None + self.input_dim = None + self.output_dim = None + + def train(self, x_train, y_train, x_validation=None, y_validation=None): + """Train the MLP on the provided image dataset. + + Parameters + ---------- + x_train : DashAIDataset + Input dataset containing images. + y_train : DashAIDataset + Target dataset containing labels. + x_validation : DashAIDataset, optional + Validation input features (unused). Defaults to None. + y_validation : DashAIDataset, optional + Validation target labels (unused). Defaults to None. + + Returns + ------- + MLPImageClassifier + The trained model instance. + """ + image_col = list(x_train.features.keys())[0] + label_col = list(y_train.features.keys())[0] + + hf_dataset = datasets.Dataset.from_dict( + { + "image": x_train[image_col], + "label": y_train[label_col], + } + ) + image_dataset = _ImageDataset(hf_dataset, has_labels=True) + + self.input_dim = ( + image_dataset.tensor_shape[0] + * image_dataset.tensor_shape[1] + * image_dataset.tensor_shape[2] + ) + self.output_dim = image_dataset.num_classes() + + train_loader = torch.utils.data.DataLoader( + image_dataset, batch_size=32, shuffle=True + ) + + self.model = _MLP(self.input_dim, self.output_dim, self.hidden_dims).to( + self.device + ) + criterion = nn.CrossEntropyLoss() + self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate) + + self.model.train() + for _ in range(self.epochs): + for images, labels in train_loader: + images, labels = images.to(self.device), labels.to(self.device) + self.optimizer.zero_grad() + outputs = self.model(images) + loss = 
criterion(outputs, labels) + loss.backward() + self.optimizer.step() + + return self + + def predict(self, x): + """Make predictions on the input dataset. + + Parameters + ---------- + x : DashAIDataset + Input dataset containing images. + + Returns + ------- + list + List of predicted probabilities for each class. + """ + image_col = list(x.features.keys())[0] + hf_dataset = datasets.Dataset.from_dict({"image": x[image_col]}) + image_dataset = _ImageDataset(hf_dataset, has_labels=False) + test_loader = torch.utils.data.DataLoader( + image_dataset, batch_size=32, shuffle=False + ) + + self.model.eval() + probs_predicted = [] + with torch.no_grad(): + for images in test_loader: + images = images.to(self.device) + output_probs = self.model(images) + probs_predicted += output_probs.tolist() + return probs_predicted + + def save(self, filename: str) -> None: + """Save the model checkpoint to disk. + + Parameters + ---------- + filename : str + Path where the checkpoint will be saved. + """ + checkpoint = { + "model_state_dict": self.model.state_dict(), + "optimizer_state_dict": self.optimizer.state_dict(), + "epochs": self.epochs, + "learning_rate": self.learning_rate, + "hidden_dims": self.hidden_dims, + "input_dim": self.input_dim, + "output_dim": self.output_dim, + } + torch.save(checkpoint, filename) + + @classmethod + def load(cls, filename: str): + """Load a model checkpoint from disk. + + Parameters + ---------- + filename : str + Path to the checkpoint file. + + Returns + ------- + MLPImageClassifier + Instance with loaded weights. 
+ """ + checkpoint = torch.load(filename, map_location=torch.device("cpu")) + instance = cls( + epochs=checkpoint["epochs"], + learning_rate=checkpoint["learning_rate"], + hidden_dims=checkpoint["hidden_dims"], + ) + instance.input_dim = checkpoint["input_dim"] + instance.output_dim = checkpoint["output_dim"] + instance.model = _MLP( + instance.input_dim, instance.output_dim, instance.hidden_dims + ) + instance.model.load_state_dict(checkpoint["model_state_dict"]) + instance.optimizer = optim.Adam(instance.model.parameters()) + instance.optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) + return instance diff --git a/DashAI/back/tasks/image_classification_task.py b/DashAI/back/tasks/image_classification_task.py new file mode 100644 index 000000000..4601886f4 --- /dev/null +++ b/DashAI/back/tasks/image_classification_task.py @@ -0,0 +1,90 @@ +from typing import TYPE_CHECKING, List, Union + +from DashAI.back.core.utils import MultilingualString +from DashAI.back.tasks.classification_task import ClassificationTask +from DashAI.back.types.categorical import Categorical +from DashAI.back.types.dashai_image import DashAIImage + +if TYPE_CHECKING: + from datasets import DatasetDict + + from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset + + +class ImageClassificationTask(ClassificationTask): + """Task for classifying images into discrete categories. + + Image classification predicts categorical labels from image inputs. + It accepts image columns as inputs, requires a single categorical + output column, and is compatible with image classifier models. + """ + + DESCRIPTION: str = MultilingualString( + en=( + "Image classification in machine learning involves predicting " + "categorical labels for image data. Models are trained to learn " + "visual patterns and features in images, enabling accurate " + "classification of new instances." 
+ ), + es=( + "La clasificación de imágenes en el aprendizaje automático implica " + "predecir etiquetas categóricas para datos de imágenes. Los modelos " + "se entrenan para aprender patrones visuales y características en " + "las imágenes, lo que permite una clasificación precisa de nuevas " + "instancias." + ), + ) + DISPLAY_NAME: str = MultilingualString( + en="Image Classification", es="Clasificación de Imágenes" + ) + SCORING_PROFILES = { + "balanced": { + "description": "Balanced", + "weights": {"Accuracy": 0.3, "F1": 0.4, "ROCAUC": 0.3}, + }, + "detectPositives": { + "description": "Detect Positives", + "weights": {"Recall": 0.6, "F1": 0.3, "Precision": 0.1}, + }, + "avoidFalseAlarms": { + "description": "Avoid False Alarms", + "weights": {"Precision": 0.6, "F1": 0.3, "Recall": 0.1}, + }, + "probabilityQuality": { + "description": "Probability Quality", + "weights": {"ROCAUC": 0.5, "LogLoss": 0.5}, + }, + } + metadata: dict = { + "inputs_types": [DashAIImage], + "outputs_types": [Categorical], + "inputs_cardinality": 1, + "outputs_cardinality": 1, + } + + def prepare_for_task( + self, + dataset: Union["DatasetDict", "DashAIDataset"], + input_columns: List[str], + output_columns: List[str], + ) -> "DashAIDataset": + """Prepare a dataset for an image classification task. + + Parameters + ---------- + dataset : Union[DatasetDict, DashAIDataset] + Dataset to be prepared. + input_columns : List[str] + Names of the image input columns. + output_columns : List[str] + Names of the categorical output columns. + + Returns + ------- + DashAIDataset + The validated dataset, ready for training or inference. 
+ """ + dashai_dataset = super().prepare_for_task( + dataset, input_columns, output_columns + ) + return dashai_dataset diff --git a/DashAI/front/src/components/datasets/DatasetModal.jsx b/DashAI/front/src/components/datasets/DatasetModal.jsx index 50478c287..b65db2da9 100644 --- a/DashAI/front/src/components/datasets/DatasetModal.jsx +++ b/DashAI/front/src/components/datasets/DatasetModal.jsx @@ -24,6 +24,8 @@ import { enqueueDatasetJob as enqueueDatasetRequest } from "../../api/job"; import DatasetPreviewStep from "./DatasetPreviewStep"; import { loadPreview } from "../../api/datasets"; +const SKIP_PREVIEW_DATALOADERS = new Set(["ImageDataLoader"]); + const steps = [ { name: "selectDataloader", label: "Select a way to upload" }, { name: "uploadDataset", label: "Configure and upload your dataset" }, @@ -203,13 +205,19 @@ function DatasetModal({ open, setOpen, updateDatasets }) { setActiveStep(stepIndex); }; + const skipPreview = SKIP_PREVIEW_DATALOADERS.has(newDataset.dataloader); + const handleNextButton = () => { if (activeStep === 0) { setActiveStep(activeStep + 1); setNextEnabled(false); } else if (activeStep === 1) { - handlePreviewDataset(); - setActiveStep(2); + if (skipPreview) { + handleSubmitNewDataset(); + } else { + handlePreviewDataset(); + setActiveStep(2); + } } else if (activeStep === 2) { handleSubmitNewDataset(); } @@ -365,7 +373,9 @@ function DatasetModal({ open, setOpen, updateDatasets }) { {activeStep === 0 ? "Next" : activeStep === 1 - ? "Preview" + ? skipPreview + ? 
"Upload" + : "Preview" : "Upload"} diff --git a/tests/back/models/test_mlp_image_classifier.py b/tests/back/models/test_mlp_image_classifier.py new file mode 100644 index 000000000..0bc7959cc --- /dev/null +++ b/tests/back/models/test_mlp_image_classifier.py @@ -0,0 +1,149 @@ +"""Tests for the MLP Image Classifier and Image Classification pipeline.""" + +import os +import shutil +import tempfile +import zipfile + +import numpy as np +import pytest +from PIL import Image + +from DashAI.back.dataloaders.classes.image_dataloader import ImageDataLoader +from DashAI.back.models.mlp_image_classifier import MLPImageClassifier + + +@pytest.fixture(scope="module") +def image_zip_path(): + """Create a temporary zip with synthetic images in imagefolder format. + + Structure: + class_0/img_0.png, img_1.png, ... + class_1/img_0.png, img_1.png, ... + """ + tmp_dir = tempfile.mkdtemp() + img_dir = os.path.join(tmp_dir, "images") + + num_classes = 3 + images_per_class = 10 + + for cls in range(num_classes): + cls_dir = os.path.join(img_dir, f"class_{cls}") + os.makedirs(cls_dir) + for i in range(images_per_class): + arr = np.random.randint(0, 255, (32, 32, 3), dtype=np.uint8) + img = Image.fromarray(arr) + img.save(os.path.join(cls_dir, f"img_{i}.png")) + + zip_path = os.path.join(tmp_dir, "test_images.zip") + with zipfile.ZipFile(zip_path, "w") as zf: + for root, _, files in os.walk(img_dir): + for f in files: + full = os.path.join(root, f) + arcname = os.path.relpath(full, img_dir) + zf.write(full, arcname) + + yield zip_path + + shutil.rmtree(tmp_dir) + + +@pytest.fixture(scope="module") +def loaded_dataset(image_zip_path): + """Load the synthetic image dataset using ImageDataLoader.""" + loader = ImageDataLoader() + temp_path = tempfile.mkdtemp() + try: + dataset = loader.load_data( + filepath_or_buffer=image_zip_path, + temp_path=temp_path, + params={}, + ) + return dataset + finally: + if os.path.exists(temp_path): + shutil.rmtree(temp_path) + + +def 
test_image_dataloader_loads_correctly(loaded_dataset): + assert loaded_dataset is not None + assert len(loaded_dataset) == 30 # 3 classes * 10 images + assert "image" in loaded_dataset.features + assert "label" in loaded_dataset.features + + +def test_mlp_image_classifier_train_and_predict(loaded_dataset): + from DashAI.back.dataloaders.classes.dashai_dataset import ( + select_columns, + split_dataset, + split_indexes, + ) + + total_rows = loaded_dataset.num_rows + train_idx, test_idx, val_idx = split_indexes( + total_rows=total_rows, + train_size=0.7, + test_size=0.15, + val_size=0.15, + ) + split_ds = split_dataset( + loaded_dataset, + train_indexes=train_idx, + test_indexes=test_idx, + val_indexes=val_idx, + ) + + x, y = select_columns(split_ds, ["image"], ["label"]) + x = split_dataset(x) + y = split_dataset(y) + + model = MLPImageClassifier(epochs=2, learning_rate=0.01, hidden_dims=[32]) + model.train(x["train"], y["train"]) + + predictions = model.predict(x["test"]) + assert isinstance(predictions, list) + assert len(predictions) == x["test"].num_rows + assert len(predictions[0]) == 3 # 3 classes + + +def test_mlp_image_classifier_save_and_load(loaded_dataset): + from DashAI.back.dataloaders.classes.dashai_dataset import ( + select_columns, + split_dataset, + split_indexes, + ) + + total_rows = loaded_dataset.num_rows + train_idx, test_idx, val_idx = split_indexes( + total_rows=total_rows, + train_size=0.7, + test_size=0.15, + val_size=0.15, + ) + split_ds = split_dataset( + loaded_dataset, + train_indexes=train_idx, + test_indexes=test_idx, + val_indexes=val_idx, + ) + + x, y = select_columns(split_ds, ["image"], ["label"]) + x = split_dataset(x) + y = split_dataset(y) + + model = MLPImageClassifier(epochs=1, learning_rate=0.01, hidden_dims=[32]) + model.train(x["train"], y["train"]) + + with tempfile.NamedTemporaryFile(suffix=".pt", delete=False) as f: + save_path = f.name + + try: + model.save(save_path) + loaded_model = MLPImageClassifier.load(save_path) 
+ + original_preds = model.predict(x["test"]) + loaded_preds = loaded_model.predict(x["test"]) + + np.testing.assert_array_almost_equal(original_preds, loaded_preds, decimal=5) + finally: + os.remove(save_path) From 5ab1954d36fe08dc1f9d04f10c708662f2c1fe4d Mon Sep 17 00:00:00 2001 From: Creylay Date: Mon, 20 Apr 2026 18:42:11 -0400 Subject: [PATCH 002/361] feat: enhance dataset handling for image classification and update type inference --- .../dataloaders/classes/dashai_dataset.py | 41 ++++++-- .../dataloaders/classes/image_dataloader.py | 19 +++- DashAI/back/job/dataset_job.py | 5 + DashAI/back/models/mlp_image_classifier.py | 99 ++++++++++++++++--- DashAI/back/types/utils.py | 6 +- 5 files changed, 147 insertions(+), 23 deletions(-) diff --git a/DashAI/back/dataloaders/classes/dashai_dataset.py b/DashAI/back/dataloaders/classes/dashai_dataset.py index 571b7f3ed..202541e7b 100644 --- a/DashAI/back/dataloaders/classes/dashai_dataset.py +++ b/DashAI/back/dataloaders/classes/dashai_dataset.py @@ -438,26 +438,51 @@ def _compute_quality_metadata(self, dataset_df) -> dict: completeness = 1 - ( dataset_df.isna().sum().sum() / (len(dataset_df) * len(dataset_df.columns)) ) - duplicate_rows = int(dataset_df.duplicated().sum()) + + from DashAI.back.types.dashai_image import DashAIImage + + hashable_cols = [ + c + for c in dataset_df.columns + if not isinstance(self.types.get(c), DashAIImage) + ] + if hashable_cols: + duplicate_rows = int(dataset_df[hashable_cols].duplicated().sum()) + else: + duplicate_rows = 0 uniqueness = 1 - (duplicate_rows / len(dataset_df)) data_quality_score = float((completeness * 0.7 + uniqueness * 0.3) * 100) - # Compute unique counts - nunique_series = dataset_df.nunique(dropna=False) + # Compute unique counts (excluding image columns which are unhashable) + nunique_series = ( + dataset_df[hashable_cols].nunique(dropna=False) if hashable_cols else {} + ) categorical_keys = self._get_categorical_columns() - categorical_cols = 
dataset_df[categorical_keys] - nunique_categorical = categorical_cols.nunique(dropna=False) + # Filter categorical columns to only hashable ones + hashable_categorical_keys = [k for k in categorical_keys if k in hashable_cols] + categorical_cols = ( + dataset_df[hashable_categorical_keys] + if hashable_categorical_keys + else dataset_df[[]] + ) + nunique_categorical = ( + categorical_cols.nunique(dropna=False) if hashable_categorical_keys else {} + ) return { "constant_columns": [ - c for c in dataset_df.columns if int(nunique_series[c]) == 1 + c + for c in hashable_cols + if c in nunique_series and int(nunique_series[c]) == 1 ], "high_cardinality_columns": [ - c for c in categorical_cols.columns if int(nunique_categorical[c]) > 100 + c + for c in hashable_categorical_keys + if c in nunique_categorical and int(nunique_categorical[c]) > 100 ], "possible_id_columns": [ - c for c in dataset_df.columns if dataset_df[c].is_unique + c for c in hashable_cols if dataset_df[c].is_unique ], "nan_ratio_per_column": { c: float(dataset_df[c].isna().mean()) for c in dataset_df.columns diff --git a/DashAI/back/dataloaders/classes/image_dataloader.py b/DashAI/back/dataloaders/classes/image_dataloader.py index beb229f60..f006d4ad0 100644 --- a/DashAI/back/dataloaders/classes/image_dataloader.py +++ b/DashAI/back/dataloaders/classes/image_dataloader.py @@ -113,4 +113,21 @@ def convert_image_to_bytes(example): shutil.rmtree(prepared_path[0]) - return to_dashai_dataset(dataset) + from DashAI.back.types.categorical import Categorical + from DashAI.back.types.dashai_image import DashAIImage + + if isinstance(dataset, Dataset): + ds_for_types = dataset + else: + first_key = list(dataset.keys())[0] + ds_for_types = dataset[first_key] + + types = {} + for col in ds_for_types.column_names: + if col == "image": + types[col] = DashAIImage() + else: + unique_vals = sorted({v for v in ds_for_types[col] if v is not None}) + types[col] = Categorical(values=unique_vals, dtype="string") + + return 
to_dashai_dataset(dataset, types=types) diff --git a/DashAI/back/job/dataset_job.py b/DashAI/back/job/dataset_job.py index 2ab1ceb98..4d2787a67 100644 --- a/DashAI/back/job/dataset_job.py +++ b/DashAI/back/job/dataset_job.py @@ -176,6 +176,11 @@ def run( if "inferred_types" in params: schema = params["inferred_types"] + elif new_dataset.types: + schema = { + col: typ.to_string() + for col, typ in new_dataset.types.items() + } else: schema = infer_types( new_dataset.to_pandas(), method="DashAIPtype" diff --git a/DashAI/back/models/mlp_image_classifier.py b/DashAI/back/models/mlp_image_classifier.py index 3e12427d8..b343247cc 100644 --- a/DashAI/back/models/mlp_image_classifier.py +++ b/DashAI/back/models/mlp_image_classifier.py @@ -86,22 +86,51 @@ def __init__(self, hf_dataset: datasets.Dataset, has_labels: bool = True): column_names[1] if has_labels and len(column_names) > 1 else None ) - self.tensor_shape = self.transforms(self.dataset[0][self.image_col_name]).shape + # Create label to index mapping if labels exist + self.label_to_idx = {} + self.idx_to_label = {} + if self.label_col_name: + unique_labels = sorted(set(self.dataset[self.label_col_name])) + self.label_to_idx = {label: idx for idx, label in enumerate(unique_labels)} + self.idx_to_label = {idx: label for label, idx in self.label_to_idx.items()} + + pil_image = self._get_pil_image(self.dataset[0][self.image_col_name]) + self.tensor_shape = self.transforms(pil_image).shape + + @staticmethod + def _get_pil_image(img_data): + """Convert image data (dict with bytes or PIL.Image) to PIL.Image.""" + import io + + from PIL import Image + + if isinstance(img_data, dict) and "bytes" in img_data: + # Image stored as bytes + buffer = io.BytesIO(img_data["bytes"]) + return Image.open(buffer) + elif hasattr(img_data, "format"): + # Already a PIL.Image + return img_data + else: + raise TypeError(f"Unsupported image data type: {type(img_data)}") def num_classes(self): if self.label_col_name is None: return 0 - return 
len(set(self.dataset[self.label_col_name])) + return len(self.label_to_idx) def __len__(self): return len(self.dataset) def __getitem__(self, idx): - image = self.transforms(self.dataset[idx][self.image_col_name]) + pil_image = self._get_pil_image(self.dataset[idx][self.image_col_name]) + image = self.transforms(pil_image) if self.label_col_name is None: return image - label = self.dataset[idx][self.label_col_name] - return image, label + # Convert label string to index + label_str = self.dataset[idx][self.label_col_name] + label_idx = self.label_to_idx[label_str] + return image, label_idx class _MLP(nn.Module): @@ -157,6 +186,18 @@ class MLPImageClassifier(BaseModel): COLOR: str = "#E91E63" ICON: str = "Image" + @staticmethod + def _collate_fn_with_labels(batch): + """Custom collate function for batches with (image, label) tuples.""" + images = torch.stack([item[0] for item in batch]) + labels = torch.tensor([item[1] for item in batch], dtype=torch.long) + return images, labels + + @staticmethod + def _collate_fn_no_labels(batch): + """Custom collate function for batches with only images.""" + return torch.stack(batch) + def __init__(self, epochs=10, learning_rate=0.001, hidden_dims=None, **kwargs): if hidden_dims is None: hidden_dims = [128, 64] @@ -168,6 +209,23 @@ def __init__(self, epochs=10, learning_rate=0.001, hidden_dims=None, **kwargs): self.optimizer = None self.input_dim = None self.output_dim = None + self.idx_to_label = {} + self.label_to_idx = {} + + def prepare_output(self, dataset, is_fit=False): + """Encode string labels to integer indices matching the model's class order.""" + import pyarrow as pa + + from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset + + if not self.label_to_idx: + return dataset + + col_name = dataset.column_names[0] + labels = dataset[col_name] + encoded = [self.label_to_idx.get(label, label) for label in labels] + table = pa.table({col_name: encoded}) + return DashAIDataset(table) def train(self, 
x_train, y_train, x_validation=None, y_validation=None): """Train the MLP on the provided image dataset. @@ -206,8 +264,14 @@ def train(self, x_train, y_train, x_validation=None, y_validation=None): ) self.output_dim = image_dataset.num_classes() + self.idx_to_label = image_dataset.idx_to_label + self.label_to_idx = image_dataset.label_to_idx + train_loader = torch.utils.data.DataLoader( - image_dataset, batch_size=32, shuffle=True + image_dataset, + batch_size=32, + shuffle=True, + collate_fn=self._collate_fn_with_labels, ) self.model = _MLP(self.input_dim, self.output_dim, self.hidden_dims).to( @@ -238,24 +302,29 @@ def predict(self, x): Returns ------- - list - List of predicted probabilities for each class. + list of lists + List of predicted probabilities for each class for each image. """ image_col = list(x.features.keys())[0] hf_dataset = datasets.Dataset.from_dict({"image": x[image_col]}) image_dataset = _ImageDataset(hf_dataset, has_labels=False) test_loader = torch.utils.data.DataLoader( - image_dataset, batch_size=32, shuffle=False + image_dataset, + batch_size=32, + shuffle=False, + collate_fn=self._collate_fn_no_labels, ) self.model.eval() - probs_predicted = [] + all_probs = [] with torch.no_grad(): for images in test_loader: images = images.to(self.device) - output_probs = self.model(images) - probs_predicted += output_probs.tolist() - return probs_predicted + logits = self.model(images) + probs = torch.softmax(logits, dim=1) + all_probs += probs.cpu().tolist() + + return all_probs def save(self, filename: str) -> None: """Save the model checkpoint to disk. 
@@ -273,6 +342,8 @@ def save(self, filename: str) -> None: "hidden_dims": self.hidden_dims, "input_dim": self.input_dim, "output_dim": self.output_dim, + "idx_to_label": self.idx_to_label, + "label_to_idx": self.label_to_idx, } torch.save(checkpoint, filename) @@ -304,4 +375,6 @@ def load(cls, filename: str): instance.model.load_state_dict(checkpoint["model_state_dict"]) instance.optimizer = optim.Adam(instance.model.parameters()) instance.optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) + instance.idx_to_label = checkpoint.get("idx_to_label", {}) + instance.label_to_idx = checkpoint.get("label_to_idx", {}) return instance diff --git a/DashAI/back/types/utils.py b/DashAI/back/types/utils.py index 7acbe0ec5..86dfd87a6 100644 --- a/DashAI/back/types/utils.py +++ b/DashAI/back/types/utils.py @@ -225,7 +225,11 @@ def get_types_from_arrow_metadata( dtype=dtype, encoder=encoder, ) - # Future implementation for images, modify as needed + elif _type == "Image": + from DashAI.back.types.dashai_image import DashAIImage + + dtype = info.get("dtype", "string") + dashai_types[column] = DashAIImage(dtype=dtype) else: dtype = info.get("dtype") dtype_map = _get_dtype_arrow_map() From 8a2a969d5eecbff07b69e70296f9778a5989fec6 Mon Sep 17 00:00:00 2001 From: Creylay Date: Tue, 21 Apr 2026 09:33:41 -0400 Subject: [PATCH 003/361] feat: add image preview functionality and enhance dataset table rendering --- DashAI/back/api/api_v1/endpoints/datasets.py | 98 +++++++++++++++++-- .../src/components/datasets/DatasetModal.jsx | 17 ++-- .../datasets/DatasetPreviewTable.jsx | 21 ++++ .../notebooks/dataset/DatasetTable.jsx | 27 +++++ .../datasetCreation/PreviewDatasetTable.jsx | 26 +++++ 5 files changed, 169 insertions(+), 20 deletions(-) diff --git a/DashAI/back/api/api_v1/endpoints/datasets.py b/DashAI/back/api/api_v1/endpoints/datasets.py index 55d943a06..bd9c792fd 100644 --- a/DashAI/back/api/api_v1/endpoints/datasets.py +++ b/DashAI/back/api/api_v1/endpoints/datasets.py @@ -29,6 
+29,49 @@ from DashAI.back.dependencies.registry import ComponentRegistry +def _build_image_preview_sample( + extract_dir: str, image_extensions: set, max_rows: int = 5 +) -> list: + """Walk an extracted imagefolder directory and return sample rows.""" + import os + + samples = [] + for root, _, files in os.walk(extract_dir): + for f in sorted(files): + ext = os.path.splitext(f)[1].lower() + if ext not in image_extensions: + continue + filepath = os.path.join(root, f) + label = os.path.basename(root) + try: + with open(filepath, "rb") as fh: + thumb = _image_bytes_to_thumbnail_data_uri(fh.read()) + samples.append({"image": thumb, "label": label}) + except Exception: + continue + if len(samples) >= max_rows: + return samples + return samples + + +def _image_bytes_to_thumbnail_data_uri(img_bytes: bytes, max_size: int = 64) -> str: + """Convert raw image bytes to a small base64 data URI thumbnail.""" + import base64 + import io + + from PIL import Image + + try: + img = Image.open(io.BytesIO(img_bytes)) + img.thumbnail((max_size, max_size)) + buf = io.BytesIO() + img.save(buf, format="PNG") + b64 = base64.b64encode(buf.getvalue()).decode() + return f"data:image/png;base64,{b64}" + except Exception: + return "[Image]" + + logger = logging.getLogger(__name__) router = APIRouter() @@ -277,10 +320,28 @@ async def filter_dataset_file( start = page * page_size paged_table = table.slice(start, page_size) - rows = [ - {col: paged_table[col][i].as_py() for col in paged_table.schema.names} - for i in range(paged_table.num_rows) - ] + + image_cols = { + col + for col in paged_table.schema.names + if pa.types.is_struct(paged_table.schema.field(col).type) + } + + rows = [] + for i in range(paged_table.num_rows): + row = {} + for col in paged_table.schema.names: + val = paged_table[col][i].as_py() + if col in image_cols and isinstance(val, dict): + img_bytes = val.get("bytes", b"") + row[col] = ( + _image_bytes_to_thumbnail_data_uri(img_bytes) + if img_bytes + else "[Image]" + ) + 
else: + row[col] = val + rows.append(row) return JSONResponse(content={"rows": rows, "total": total}) @@ -1380,11 +1441,25 @@ async def get_dataset_file( slice_end = min(batch.num_rows, end - batch_start) sliced_batch = batch.slice(slice_start, slice_end - slice_start) + image_cols = { + col + for col in sliced_batch.schema.names + if pa.types.is_struct(sliced_batch.schema.field(col).type) + } + for j in range(sliced_batch.num_rows): - row = { - col: sliced_batch[col][j].as_py() - for col in sliced_batch.schema.names - } + row = {} + for col in sliced_batch.schema.names: + val = sliced_batch[col][j].as_py() + if col in image_cols and isinstance(val, dict): + img_bytes = val.get("bytes", b"") + row[col] = ( + _image_bytes_to_thumbnail_data_uri(img_bytes) + if img_bytes + else "[Image]" + ) + else: + row[col] = val rows.append(row) rows_collected += 1 if rows_collected >= page_size: @@ -1669,10 +1744,13 @@ async def preview_with_types( break if dataloader_name is None and has_images: + sample_rows = _build_image_preview_sample( + extract_dir, image_extensions, max_rows=5 + ) shutil.rmtree(extract_dir, ignore_errors=True) os.unlink(tmp_file_path) return { - "sample": [], + "sample": sample_rows, "schema": { "image": {"type": "Image", "dtype": "string"}, "label": { @@ -1687,7 +1765,7 @@ async def preview_with_types( "dtype": "string", }, }, - "preview_row_count": 0, + "preview_row_count": len(sample_rows), } if dataloader_name is None: diff --git a/DashAI/front/src/components/datasets/DatasetModal.jsx b/DashAI/front/src/components/datasets/DatasetModal.jsx index b65db2da9..eee632f87 100644 --- a/DashAI/front/src/components/datasets/DatasetModal.jsx +++ b/DashAI/front/src/components/datasets/DatasetModal.jsx @@ -22,9 +22,9 @@ import ConfigureAndUploadDataset from "./ConfigureAndUploadDataset"; import { useSnackbar } from "notistack"; import { enqueueDatasetJob as enqueueDatasetRequest } from "../../api/job"; import DatasetPreviewStep from "./DatasetPreviewStep"; -import 
{ loadPreview } from "../../api/datasets"; +import { previewWithTypes } from "../../api/datasets"; -const SKIP_PREVIEW_DATALOADERS = new Set(["ImageDataLoader"]); +const SKIP_PREVIEW_DATALOADERS = new Set([]); const steps = [ { name: "selectDataloader", label: "Select a way to upload" }, @@ -71,7 +71,7 @@ function DatasetModal({ open, setOpen, updateDatasets }) { newDataset.params["dataloader"] = newDataset.dataloader; await enqueueDatasetRequest(newDataset.file, name, newDataset.url, { ...newDataset.params, - schema: columnsSpec, + inferred_types: columnsSpec, }); enqueueSnackbar("Dataset upload job started", { variant: "success" }); @@ -92,17 +92,14 @@ function DatasetModal({ open, setOpen, updateDatasets }) { formData.append("params", JSON.stringify(newDataset.params)); try { - const preview = await loadPreview(formData); + const preview = await previewWithTypes(formData); setPreviewData(preview); - //Save the columns spec to be used in the preview table + const source = preview.inferred_types || preview.schema; const initialColumnsSpec = {}; - Object.keys(preview.schema).forEach((columnName) => { - initialColumnsSpec[columnName] = { - type: preview.schema[columnName].type, - dtype: preview.schema[columnName].dtype, - }; + Object.keys(source).forEach((columnName) => { + initialColumnsSpec[columnName] = { ...source[columnName] }; }); setColumnsSpec(initialColumnsSpec); diff --git a/DashAI/front/src/components/datasets/DatasetPreviewTable.jsx b/DashAI/front/src/components/datasets/DatasetPreviewTable.jsx index f5dbd2787..ce4c9ebc6 100644 --- a/DashAI/front/src/components/datasets/DatasetPreviewTable.jsx +++ b/DashAI/front/src/components/datasets/DatasetPreviewTable.jsx @@ -86,6 +86,27 @@ function DatasetPreviewTable({ accessorKey: "example", header: "Example", size: 200, + Cell: ({ cell, row }) => { + const val = cell.getValue(); + if ( + row.original.columnType === "Image" && + typeof val === "string" && + val.startsWith("data:image") + ) { + return ( + preview + 
); + } + return String(val ?? ""); + }, }, { accessorKey: "columnType", diff --git a/DashAI/front/src/components/notebooks/dataset/DatasetTable.jsx b/DashAI/front/src/components/notebooks/dataset/DatasetTable.jsx index bf33c02e6..4c5e40e13 100644 --- a/DashAI/front/src/components/notebooks/dataset/DatasetTable.jsx +++ b/DashAI/front/src/components/notebooks/dataset/DatasetTable.jsx @@ -351,11 +351,38 @@ export default function DatasetTable({ const colTypeRaw = columnTypes[key]; const colType = typeof colTypeRaw === "string" ? colTypeRaw : (colTypeRaw?.type ?? ""); + const isImage = + colType === "Image" || + (data.length > 0 && + typeof data[0][key] === "string" && + data[0][key].startsWith("data:image")); const filterVariant = "text"; return { accessorKey: key, header: key, filterVariant, + enableColumnFilter: !isImage, + enableSorting: !isImage, + ...(isImage && { + Cell: ({ cell }) => { + const val = cell.getValue(); + if (typeof val === "string" && val.startsWith("data:image")) { + return ( + img + ); + } + return val; + }, + size: 80, + }), filterFn: ["Integer", "Float"].includes(colType) ? 
"between" : "contains", diff --git a/DashAI/front/src/components/notebooks/datasetCreation/PreviewDatasetTable.jsx b/DashAI/front/src/components/notebooks/datasetCreation/PreviewDatasetTable.jsx index 06c4e2646..b2463ea49 100644 --- a/DashAI/front/src/components/notebooks/datasetCreation/PreviewDatasetTable.jsx +++ b/DashAI/front/src/components/notebooks/datasetCreation/PreviewDatasetTable.jsx @@ -30,6 +30,7 @@ const TYPE_TO_DEFAULT_DTYPE = { // Duration: "duration(us)", Decimal: "decimal128(8, 0)", Binary: "binary", + Image: "string", // Boolean: "bool", // Boolean is always Categorical }; @@ -207,6 +208,10 @@ export default function PreviewDatasetTable({ return Object.keys(firstRow).map((field) => { const columnType = columnTypes[field]; const displayName = columnNames[field] || field; + const isImage = + columnType?.type === "Image" || + (typeof firstRow[field] === "string" && + firstRow[field].startsWith("data:image")); return { accessorKey: field, @@ -215,6 +220,26 @@ export default function PreviewDatasetTable({ grow: 1, enableSorting: false, enableColumnActions: false, + ...(isImage && { + Cell: ({ cell }) => { + const val = cell.getValue(); + if (typeof val === "string" && val.startsWith("data:image")) { + return ( + img + ); + } + return val; + }, + size: 80, + }), Header: () => ( Float Text Categorical + Image {columnType?.type === "Categorical" && columnType?.encoder && ( From 84cc0b5e0dc407c6292abeb1904a2bc5037a4e41 Mon Sep 17 00:00:00 2001 From: Creylay Date: Tue, 21 Apr 2026 09:56:22 -0400 Subject: [PATCH 004/361] feat: add one-hot encoder to label in preview_with_types response --- DashAI/back/api/api_v1/endpoints/datasets.py | 1 + 1 file changed, 1 insertion(+) diff --git a/DashAI/back/api/api_v1/endpoints/datasets.py b/DashAI/back/api/api_v1/endpoints/datasets.py index bd9c792fd..386c97c89 100644 --- a/DashAI/back/api/api_v1/endpoints/datasets.py +++ b/DashAI/back/api/api_v1/endpoints/datasets.py @@ -1763,6 +1763,7 @@ async def preview_with_types( 
"label": { "type": "Categorical", "dtype": "string", + "encoder": "one_hot", }, }, "preview_row_count": len(sample_rows), From 62cc43263aad14837aebd59a265069a2b64dc727 Mon Sep 17 00:00:00 2001 From: Creylay Date: Tue, 21 Apr 2026 10:25:23 -0400 Subject: [PATCH 005/361] feat: enhance label conversion in image dataset loading --- .../dataloaders/classes/image_dataloader.py | 50 ++++++++++++++++--- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/DashAI/back/dataloaders/classes/image_dataloader.py b/DashAI/back/dataloaders/classes/image_dataloader.py index f006d4ad0..d6a54e242 100644 --- a/DashAI/back/dataloaders/classes/image_dataloader.py +++ b/DashAI/back/dataloaders/classes/image_dataloader.py @@ -111,23 +111,61 @@ def convert_image_to_bytes(example): dataset = dataset.map(convert_image_to_bytes) + # Convert ClassLabel columns (integers) back to their string names. + # HF ClassLabel silently casts map() outputs back to int, so we need + # to build a new dataset with the strings directly. 
+ from datasets import ClassLabel as HFClassLabel + from datasets import Features, Value + + if isinstance(dataset, Dataset): + ds_ref = dataset + else: + first_key = list(dataset.keys())[0] + ds_ref = dataset[first_key] + + classlabel_cols = {} + for col in ds_ref.column_names: + feat = ds_ref.features.get(col) + if isinstance(feat, HFClassLabel): + classlabel_cols[col] = feat.names + + if classlabel_cols: + new_features = Features( + { + col: Value("string") if col in classlabel_cols else feat + for col, feat in ds_ref.features.items() + } + ) + + def convert_labels(example): + for col, names in classlabel_cols.items(): + example[col] = names[example[col]] + return example + + if isinstance(dataset, Dataset): + dataset = dataset.map(convert_labels, features=new_features) + else: + for split_name in list(dataset.keys()): + dataset[split_name] = dataset[split_name].map( + convert_labels, features=new_features + ) + ds_ref = dataset[first_key] + shutil.rmtree(prepared_path[0]) from DashAI.back.types.categorical import Categorical from DashAI.back.types.dashai_image import DashAIImage - if isinstance(dataset, Dataset): - ds_for_types = dataset - else: - first_key = list(dataset.keys())[0] - ds_for_types = dataset[first_key] + ds_for_types = dataset if isinstance(dataset, Dataset) else ds_ref types = {} for col in ds_for_types.column_names: if col == "image": types[col] = DashAIImage() else: - unique_vals = sorted({v for v in ds_for_types[col] if v is not None}) + unique_vals = sorted( + {str(v) for v in ds_for_types[col] if v is not None} + ) types[col] = Categorical(values=unique_vals, dtype="string") return to_dashai_dataset(dataset, types=types) From 1d0bc63256b3a155a7494fb538797bc8b48a5593 Mon Sep 17 00:00:00 2001 From: Creylay Date: Tue, 21 Apr 2026 10:31:26 -0400 Subject: [PATCH 006/361] feat: improve duplicate row detection by checking categorical types in hashable columns --- DashAI/back/dataloaders/classes/dashai_dataset.py | 10 ++++++++-- 1 file changed, 8 
insertions(+), 2 deletions(-) diff --git a/DashAI/back/dataloaders/classes/dashai_dataset.py b/DashAI/back/dataloaders/classes/dashai_dataset.py index 202541e7b..c94dc49dc 100644 --- a/DashAI/back/dataloaders/classes/dashai_dataset.py +++ b/DashAI/back/dataloaders/classes/dashai_dataset.py @@ -247,7 +247,10 @@ def _compute_general_info(self, dataset_df) -> dict: for c in dataset_df.columns if not isinstance(self.types.get(c), DashAIImage) ] - if hashable_cols: + all_categorical = hashable_cols and all( + isinstance(self.types.get(c), Categorical) for c in hashable_cols + ) + if hashable_cols and not all_categorical: duplicate_rows = int(dataset_df[hashable_cols].duplicated().sum()) else: duplicate_rows = 0 @@ -446,7 +449,10 @@ def _compute_quality_metadata(self, dataset_df) -> dict: for c in dataset_df.columns if not isinstance(self.types.get(c), DashAIImage) ] - if hashable_cols: + all_categorical = hashable_cols and all( + isinstance(self.types.get(c), Categorical) for c in hashable_cols + ) + if hashable_cols and not all_categorical: duplicate_rows = int(dataset_df[hashable_cols].duplicated().sum()) else: duplicate_rows = 0 From 214575ac36ac1bf03906ce5bd11197ffd2db353c Mon Sep 17 00:00:00 2001 From: Creylay Date: Tue, 21 Apr 2026 11:19:46 -0400 Subject: [PATCH 007/361] feat: enhance image preview and export functionality, add localization for row display messages --- DashAI/back/api/api_v1/endpoints/datasets.py | 208 +++++++++++++++--- .../src/components/models/PredictionCard.jsx | 8 +- .../datasetCreation/PreviewDataset.jsx | 13 +- .../datasetCreation/PreviewDatasetTable.jsx | 6 + .../predictions/PredictionModal.jsx | 8 +- .../src/utils/i18n/locales/en/datasets.json | 1 + .../src/utils/i18n/locales/es/datasets.json | 1 + 7 files changed, 207 insertions(+), 38 deletions(-) diff --git a/DashAI/back/api/api_v1/endpoints/datasets.py b/DashAI/back/api/api_v1/endpoints/datasets.py index 386c97c89..01290ef14 100644 --- a/DashAI/back/api/api_v1/endpoints/datasets.py 
+++ b/DashAI/back/api/api_v1/endpoints/datasets.py @@ -31,16 +31,21 @@ def _build_image_preview_sample( extract_dir: str, image_extensions: set, max_rows: int = 5 -) -> list: - """Walk an extracted imagefolder directory and return sample rows.""" +) -> tuple: + """Walk an extracted imagefolder directory and return sample rows + and total count.""" import os samples = [] + total = 0 for root, _, files in os.walk(extract_dir): for f in sorted(files): ext = os.path.splitext(f)[1].lower() if ext not in image_extensions: continue + total += 1 + if len(samples) >= max_rows: + continue filepath = os.path.join(root, f) label = os.path.basename(root) try: @@ -49,9 +54,7 @@ def _build_image_preview_sample( samples.append({"image": thumb, "label": label}) except Exception: continue - if len(samples) >= max_rows: - return samples - return samples + return samples, total def _image_bytes_to_thumbnail_data_uri(img_bytes: bytes, max_size: int = 64) -> str: @@ -1524,20 +1527,94 @@ async def export_dataset_as_csv( table = pa.Table.from_batches(batches) - # Convert to CSV - output = io.BytesIO() - csv.write_csv(table, output) - output.seek(0) + # Detect image columns (struct with binary bytes) + image_cols = [ + col + for col in table.column_names + if pa.types.is_struct(table.schema.field(col).type) + ] - # Get dataset name from path for filename dataset_name = os.path.basename(path.rstrip("/")) - filename = f"{dataset_name}.csv" - return StreamingResponse( - io.BytesIO(output.getvalue()), - media_type="text/csv", - headers={"Content-Disposition": f"attachment; filename={filename}"}, - ) + if image_cols: + # Export as ZIP: images as files + CSV with other columns + import zipfile + + zip_buffer = io.BytesIO() + with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf: + # Save images and build filename references + image_filenames = {col: [] for col in image_cols} + num_rows = table.num_rows + for col in image_cols: + struct_col = table.column(col) + for i in 
range(num_rows): + struct_val = struct_col[i].as_py() + if struct_val and struct_val.get("bytes"): + img_bytes = struct_val["bytes"] + fmt = (struct_val.get("format", "PNG") or "PNG").lower() + ext = "jpg" if fmt == "jpeg" else fmt + fname = f"images/{col}_{i}.{ext}" + zf.writestr(fname, img_bytes) + image_filenames[col].append(fname) + else: + image_filenames[col].append("") + + # Build CSV without image columns but with filename refs + csv_table = table.drop(image_cols) + for col in image_cols: + csv_table = csv_table.append_column( + f"{col}_file", + pa.array(image_filenames[col], type=pa.string()), + ) + + # Drop any remaining binary columns + remaining_drop = [ + c + for c in csv_table.column_names + if pa.types.is_binary(csv_table.schema.field(c).type) + or pa.types.is_large_binary(csv_table.schema.field(c).type) + ] + if remaining_drop: + csv_table = csv_table.drop(remaining_drop) + + csv_output = io.BytesIO() + csv.write_csv(csv_table, csv_output) + zf.writestr("data.csv", csv_output.getvalue()) + + zip_buffer.seek(0) + filename = f"{dataset_name}.zip" + + return StreamingResponse( + zip_buffer, + media_type="application/zip", + headers={ + "Content-Disposition": (f"attachment; filename={filename}") + }, + ) + else: + # No image columns: export as plain CSV + drop_cols = [ + col + for col in table.column_names + if pa.types.is_binary(table.schema.field(col).type) + or pa.types.is_large_binary(table.schema.field(col).type) + ] + if drop_cols: + table = table.drop(drop_cols) + + output = io.BytesIO() + csv.write_csv(table, output) + output.seek(0) + + filename = f"{dataset_name}.csv" + + return StreamingResponse( + io.BytesIO(output.getvalue()), + media_type="text/csv", + headers={ + "Content-Disposition": (f"attachment; filename={filename}") + }, + ) except FileNotFoundError as e: raise HTTPException( @@ -1626,19 +1703,88 @@ async def export_dataset_csv_by_id( table = pa.Table.from_batches(batches) - # Convert to CSV - output = io.BytesIO() - 
csv.write_csv(table, output) - output.seek(0) - - # Use dataset name for filename - filename = f"{dataset.name}.csv" + image_cols = [ + col + for col in table.column_names + if pa.types.is_struct(table.schema.field(col).type) + ] - return StreamingResponse( - io.BytesIO(output.getvalue()), - media_type="text/csv", - headers={"Content-Disposition": f"attachment; filename={filename}"}, - ) + if image_cols: + import zipfile + + zip_buffer = io.BytesIO() + with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf: + image_filenames = {col: [] for col in image_cols} + num_rows = table.num_rows + for col in image_cols: + struct_col = table.column(col) + for i in range(num_rows): + struct_val = struct_col[i].as_py() + if struct_val and struct_val.get("bytes"): + img_bytes = struct_val["bytes"] + fmt = ( + struct_val.get("format", "PNG") or "PNG" + ).lower() + ext = "jpg" if fmt == "jpeg" else fmt + fname = f"images/{col}_{i}.{ext}" + zf.writestr(fname, img_bytes) + image_filenames[col].append(fname) + else: + image_filenames[col].append("") + + csv_table = table.drop(image_cols) + for col in image_cols: + csv_table = csv_table.append_column( + f"{col}_file", + pa.array(image_filenames[col], type=pa.string()), + ) + + remaining_drop = [ + c + for c in csv_table.column_names + if pa.types.is_binary(csv_table.schema.field(c).type) + or pa.types.is_large_binary(csv_table.schema.field(c).type) + ] + if remaining_drop: + csv_table = csv_table.drop(remaining_drop) + + csv_output = io.BytesIO() + csv.write_csv(csv_table, csv_output) + zf.writestr("data.csv", csv_output.getvalue()) + + zip_buffer.seek(0) + filename = f"{dataset.name}.zip" + + return StreamingResponse( + zip_buffer, + media_type="application/zip", + headers={ + "Content-Disposition": (f"attachment; filename={filename}") + }, + ) + else: + drop_cols = [ + col + for col in table.column_names + if pa.types.is_binary(table.schema.field(col).type) + or pa.types.is_large_binary(table.schema.field(col).type) + ] + 
if drop_cols: + table = table.drop(drop_cols) + + output = io.BytesIO() + csv.write_csv(table, output) + output.seek(0) + + filename = f"{dataset.name}.csv" + + return StreamingResponse( + io.BytesIO(output.getvalue()), + media_type="text/csv", + headers={ + "Content-Disposition": (f"attachment; filename={filename}") + }, + ) except exc.SQLAlchemyError as e: logger.exception(e) @@ -1744,7 +1890,7 @@ async def preview_with_types( break if dataloader_name is None and has_images: - sample_rows = _build_image_preview_sample( + sample_rows, total_images = _build_image_preview_sample( extract_dir, image_extensions, max_rows=5 ) shutil.rmtree(extract_dir, ignore_errors=True) @@ -1766,7 +1912,8 @@ async def preview_with_types( "encoder": "one_hot", }, }, - "preview_row_count": len(sample_rows), + "preview_row_count": total_images, + "types_inferred": False, } if dataloader_name is None: @@ -1867,6 +2014,7 @@ async def preview_with_types( "schema": arrow_schema, "inferred_types": inferred_types, "preview_row_count": len(loaded_dataset), + "types_inferred": True, } finally: diff --git a/DashAI/front/src/components/models/PredictionCard.jsx b/DashAI/front/src/components/models/PredictionCard.jsx index ee56493f2..bcc59f615 100644 --- a/DashAI/front/src/components/models/PredictionCard.jsx +++ b/DashAI/front/src/components/models/PredictionCard.jsx @@ -116,7 +116,11 @@ export default function PredictionCard({ prediction, onDelete, onUpdate }) { const handleDownload = async () => { try { const data = await exportDatasetCsvByPath(prediction.results_path); - const blob = new Blob([data], { type: "text/csv;charset=utf-8;" }); + const isZip = data.type === "application/zip"; + const blob = new Blob([data], { + type: isZip ? "application/zip" : "text/csv;charset=utf-8;", + }); + const ext = isZip ? 
"zip" : "csv"; const url = URL.createObjectURL(blob); const link = document.createElement("a"); link.href = url; @@ -124,7 +128,7 @@ export default function PredictionCard({ prediction, onDelete, onUpdate }) { "download", `prediction-${prediction.id}-${ new Date(prediction.created).toISOString().split("T")[0] - }.csv`, + }.${ext}`, ); document.body.appendChild(link); link.click(); diff --git a/DashAI/front/src/components/notebooks/datasetCreation/PreviewDataset.jsx b/DashAI/front/src/components/notebooks/datasetCreation/PreviewDataset.jsx index 70ade95cc..1c62b177b 100644 --- a/DashAI/front/src/components/notebooks/datasetCreation/PreviewDataset.jsx +++ b/DashAI/front/src/components/notebooks/datasetCreation/PreviewDataset.jsx @@ -243,10 +243,15 @@ function PreviewDataset({ }} > - {t("datasets:label.showingRowsInference", { - sampleLength: previewData.sample.length, - previewRowCount: previewData.preview_row_count, - })} + {t( + previewData.types_inferred === false + ? "datasets:label.showingRowsPreview" + : "datasets:label.showingRowsInference", + { + sampleLength: previewData.sample.length, + previewRowCount: previewData.preview_row_count, + }, + )}
{t("datasets:label.changeColumnTypesInfo")}
diff --git a/DashAI/front/src/components/notebooks/datasetCreation/PreviewDatasetTable.jsx b/DashAI/front/src/components/notebooks/datasetCreation/PreviewDatasetTable.jsx index 3b46896cd..3463ee610 100644 --- a/DashAI/front/src/components/notebooks/datasetCreation/PreviewDatasetTable.jsx +++ b/DashAI/front/src/components/notebooks/datasetCreation/PreviewDatasetTable.jsx @@ -358,6 +358,12 @@ export default function PreviewDatasetTable({ }, muiTablePaperProps: { elevation: 0 }, paginationDisplayMode: "pages", + enableColumnFilters: false, + enableGlobalFilter: false, + enableDensityToggle: false, + enableFullScreenToggle: false, + enableHiding: false, + enableTopToolbar: false, }); return ( diff --git a/DashAI/front/src/components/predictions/PredictionModal.jsx b/DashAI/front/src/components/predictions/PredictionModal.jsx index c38a6cd25..1fdd4dc5c 100644 --- a/DashAI/front/src/components/predictions/PredictionModal.jsx +++ b/DashAI/front/src/components/predictions/PredictionModal.jsx @@ -249,13 +249,17 @@ export default function PredictionModal({ isOpen, onClose, run }) { const handleDownload = async (selectedPrediction) => { const data = await exportDatasetCsvByPath(selectedPrediction.results_path); - const blob = new Blob([data], { type: "text/csv;charset=utf-8;" }); + const isZip = data.type === "application/zip"; + const blob = new Blob([data], { + type: isZip ? "application/zip" : "text/csv;charset=utf-8;", + }); + const ext = isZip ? 
"zip" : "csv"; const url = URL.createObjectURL(blob); const link = document.createElement("a"); link.href = url; link.setAttribute( "download", - `${"prediction-" + selectedPrediction.created}.csv`, + `${"prediction-" + selectedPrediction.created}.${ext}`, ); document.body.appendChild(link); link.click(); diff --git a/DashAI/front/src/utils/i18n/locales/en/datasets.json b/DashAI/front/src/utils/i18n/locales/en/datasets.json index ad0d092e1..6dc1b4f85 100644 --- a/DashAI/front/src/utils/i18n/locales/en/datasets.json +++ b/DashAI/front/src/utils/i18n/locales/en/datasets.json @@ -318,6 +318,7 @@ "shapeIndicators": "Shape Indicators", "showGrid": "{{axis}} Axis Show Grid", "showingRowsInference": "Showing {{sampleLength}} of {{previewRowCount}} rows analyzed for type inference.", + "showingRowsPreview": "Showing {{sampleLength}} of {{previewRowCount}} rows.", "showLegend": "Show Legend", "showZeroLine": "{{axis}} Axis Show Zero Line", "skewness": "Skewness", diff --git a/DashAI/front/src/utils/i18n/locales/es/datasets.json b/DashAI/front/src/utils/i18n/locales/es/datasets.json index cbae3ac39..b70093509 100644 --- a/DashAI/front/src/utils/i18n/locales/es/datasets.json +++ b/DashAI/front/src/utils/i18n/locales/es/datasets.json @@ -323,6 +323,7 @@ "shapeIndicators": "Indicadores de Forma", "showGrid": "Mostrar Cuadrícula del Eje {{axis}}", "showingRowsInference": "Mostrando {{sampleLength}} de {{previewRowCount}} filas analizadas para inferencia de tipo.", + "showingRowsPreview": "Mostrando {{sampleLength}} de {{previewRowCount}} filas.", "showLegend": "Mostrar Leyenda", "showZeroLine": "Mostrar Línea Cero del Eje {{axis}}", "skewness": "Asimetría", From 95ebd217c611b0797c0dc902ad716bcf8d306594 Mon Sep 17 00:00:00 2001 From: Creylay Date: Tue, 21 Apr 2026 11:29:45 -0400 Subject: [PATCH 008/361] feat: enhance metadata display by mapping class names to user-friendly labels --- DashAI/back/tasks/base_task.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 
deletions(-) diff --git a/DashAI/back/tasks/base_task.py b/DashAI/back/tasks/base_task.py index dd473de4b..b2e88d9b7 100644 --- a/DashAI/back/tasks/base_task.py +++ b/DashAI/back/tasks/base_task.py @@ -61,11 +61,16 @@ def get_metadata(cls) -> Dict[str, Any]: """ metadata = cls.metadata - # Extract class names - inputs_types = [input_type.__name__ for input_type in metadata["inputs_types"]] - outputs_types = [ - output_type.__name__ for output_type in metadata["outputs_types"] - ] + _CLASS_NAME_TO_DISPLAY = { + "DashAIImage": "Image", + } + + def _type_display_name(t): + name = t.__name__ + return _CLASS_NAME_TO_DISPLAY.get(name, name) + + inputs_types = [_type_display_name(t) for t in metadata["inputs_types"]] + outputs_types = [_type_display_name(t) for t in metadata["outputs_types"]] parsed_metadata: dict = { "inputs_types": inputs_types, From 85968cb7089ce55c3b81c442b109ca6cade96a51 Mon Sep 17 00:00:00 2001 From: Creylay Date: Tue, 21 Apr 2026 15:20:25 -0400 Subject: [PATCH 009/361] feat: enhance image handling in dataset processing and prediction workflows --- DashAI/back/api/api_v1/endpoints/datasets.py | 43 ++++++++--- DashAI/back/api/api_v1/endpoints/jobs.py | 26 +++++++ DashAI/back/job/predict_job.py | 12 ++- DashAI/back/models/mlp_image_classifier.py | 5 +- DashAI/back/tasks/base_task.py | 18 ++++- DashAI/front/src/api/job.ts | 4 +- .../models/ManualPredictionPanel.jsx | 20 ++++- .../src/components/predictions/InputField.jsx | 76 +++++++++++++------ .../predictions/ManualInputForm.jsx | 4 +- .../predictions/PredictionModal.jsx | 11 ++- .../src/utils/i18n/locales/en/prediction.json | 2 + .../src/utils/i18n/locales/es/prediction.json | 2 + 12 files changed, 178 insertions(+), 45 deletions(-) diff --git a/DashAI/back/api/api_v1/endpoints/datasets.py b/DashAI/back/api/api_v1/endpoints/datasets.py index 01290ef14..adedd6f0a 100644 --- a/DashAI/back/api/api_v1/endpoints/datasets.py +++ b/DashAI/back/api/api_v1/endpoints/datasets.py @@ -328,6 +328,8 @@ async def 
filter_dataset_file( col for col in paged_table.schema.names if pa.types.is_struct(paged_table.schema.field(col).type) + or pa.types.is_large_binary(paged_table.schema.field(col).type) + or pa.types.is_binary(paged_table.schema.field(col).type) } rows = [] @@ -335,8 +337,13 @@ async def filter_dataset_file( row = {} for col in paged_table.schema.names: val = paged_table[col][i].as_py() - if col in image_cols and isinstance(val, dict): - img_bytes = val.get("bytes", b"") + if col in image_cols: + if isinstance(val, dict): + img_bytes = val.get("bytes", b"") + elif isinstance(val, bytes): + img_bytes = val + else: + img_bytes = b"" row[col] = ( _image_bytes_to_thumbnail_data_uri(img_bytes) if img_bytes @@ -1448,14 +1455,21 @@ async def get_dataset_file( col for col in sliced_batch.schema.names if pa.types.is_struct(sliced_batch.schema.field(col).type) + or pa.types.is_large_binary(sliced_batch.schema.field(col).type) + or pa.types.is_binary(sliced_batch.schema.field(col).type) } for j in range(sliced_batch.num_rows): row = {} for col in sliced_batch.schema.names: val = sliced_batch[col][j].as_py() - if col in image_cols and isinstance(val, dict): - img_bytes = val.get("bytes", b"") + if col in image_cols: + if isinstance(val, dict): + img_bytes = val.get("bytes", b"") + elif isinstance(val, bytes): + img_bytes = val + else: + img_bytes = b"" row[col] = ( _image_bytes_to_thumbnail_data_uri(img_bytes) if img_bytes @@ -1527,11 +1541,13 @@ async def export_dataset_as_csv( table = pa.Table.from_batches(batches) - # Detect image columns (struct with binary bytes) + # Detect image columns (struct or raw binary) image_cols = [ col for col in table.column_names if pa.types.is_struct(table.schema.field(col).type) + or pa.types.is_binary(table.schema.field(col).type) + or pa.types.is_large_binary(table.schema.field(col).type) ] dataset_name = os.path.basename(path.rstrip("/")) @@ -1546,13 +1562,20 @@ async def export_dataset_as_csv( image_filenames = {col: [] for col in 
image_cols} num_rows = table.num_rows for col in image_cols: - struct_col = table.column(col) + arr_col = table.column(col) for i in range(num_rows): - struct_val = struct_col[i].as_py() - if struct_val and struct_val.get("bytes"): - img_bytes = struct_val["bytes"] - fmt = (struct_val.get("format", "PNG") or "PNG").lower() + val = arr_col[i].as_py() + if isinstance(val, dict) and val.get("bytes"): + img_bytes = val["bytes"] + fmt = (val.get("format", "PNG") or "PNG").lower() ext = "jpg" if fmt == "jpeg" else fmt + elif isinstance(val, bytes) and val: + img_bytes = val + ext = "png" + else: + img_bytes = None + ext = "png" + if img_bytes: fname = f"images/{col}_{i}.{ext}" zf.writestr(fname, img_bytes) image_filenames[col].append(fname) diff --git a/DashAI/back/api/api_v1/endpoints/jobs.py b/DashAI/back/api/api_v1/endpoints/jobs.py index 715a24783..cebccad2a 100644 --- a/DashAI/back/api/api_v1/endpoints/jobs.py +++ b/DashAI/back/api/api_v1/endpoints/jobs.py @@ -235,6 +235,32 @@ async def enqueue_job( kwargs = json.loads(kwargs_str) + # Handle image files for manual predictions + import re + + manual_input = kwargs.get("manual_input_data") + if manual_input and isinstance(manual_input, list): + file_key_regex = re.compile(r"^file_(\d+)_(.+)$") + temp_dir = tempfile.mkdtemp() + for field_name in form: + upload = form[field_name] + if not hasattr(upload, "read"): + continue + match = file_key_regex.match(field_name) + if not match: + continue + row_idx = int(match.group(1)) + col_name = match.group(2) + if row_idx < 0 or row_idx >= len(manual_input): + continue + if manual_input[row_idx].get(col_name) != field_name: + continue + file_bytes = await upload.read() + file_path = os.path.join(temp_dir, f"{row_idx}_{col_name}") + with open(file_path, "wb") as f: + f.write(file_bytes) + manual_input[row_idx][col_name] = {"__image_file__": file_path} + # instantiate job with only primitive args JobClass = component_registry[job_type]["class"] job = JobClass(**kwargs) diff --git 
a/DashAI/back/job/predict_job.py b/DashAI/back/job/predict_job.py index 54ca2ac31..ddb904823 100644 --- a/DashAI/back/job/predict_job.py +++ b/DashAI/back/job/predict_job.py @@ -47,12 +47,21 @@ def _build_preview_rows( y_pred: Any, ) -> Tuple[List[str], List[List]]: """Build JSON-safe tabular rows for manual preview responses.""" + import base64 + columns = list(input_columns) + [output_col] rows: List[List] = [] input_data = prepared_dataset.to_dict() for i in range(len(y_pred)): - row = [input_data[col][i] for col in input_columns] + row = [] + for col in input_columns: + val = input_data[col][i] + if isinstance(val, bytes): + val = "data:image/png;base64," + base64.b64encode(val).decode() + elif isinstance(val, dict) and "bytes" in val: + val = "data:image/png;base64," + base64.b64encode(val["bytes"]).decode() + row.append(val) row.append(y_pred[i]) rows.append(row) @@ -177,6 +186,7 @@ def run_manual_prediction( model_session=model_session, ) except (ValueError, TypeError) as e: + logging.exception("Manual prediction input error: %s", e) raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, detail=f"Invalid input data: {str(e)}", diff --git a/DashAI/back/models/mlp_image_classifier.py b/DashAI/back/models/mlp_image_classifier.py index b343247cc..5ba66acea 100644 --- a/DashAI/back/models/mlp_image_classifier.py +++ b/DashAI/back/models/mlp_image_classifier.py @@ -105,11 +105,12 @@ def _get_pil_image(img_data): from PIL import Image if isinstance(img_data, dict) and "bytes" in img_data: - # Image stored as bytes buffer = io.BytesIO(img_data["bytes"]) return Image.open(buffer) + elif isinstance(img_data, bytes): + buffer = io.BytesIO(img_data) + return Image.open(buffer) elif hasattr(img_data, "format"): - # Already a PIL.Image return img_data else: raise TypeError(f"Unsupported image data type: {type(img_data)}") diff --git a/DashAI/back/tasks/base_task.py b/DashAI/back/tasks/base_task.py index b2e88d9b7..04d60f99c 100644 --- 
a/DashAI/back/tasks/base_task.py +++ b/DashAI/back/tasks/base_task.py @@ -394,12 +394,26 @@ def process_manual_input( if isinstance(value, UploadFile): file_bytes = value.file.read() data, detected_type = get_bytes_with_type_filetype(file_bytes) + expected_type = column_spec.get("type", "") + if detected_type.lower() != expected_type.lower(): + raise TypeError( + f"Row {row_idx}, column '{col_name}': " + f"File type '{detected_type}' doesn't match " + f"expected type '{expected_type}'" + ) + row[col_name] = data - if detected_type != column_spec.get("type"): + # File saved to disk by job queue + elif isinstance(value, dict) and "__image_file__" in value: + with open(value["__image_file__"], "rb") as f: + file_bytes = f.read() + data, detected_type = get_bytes_with_type_filetype(file_bytes) + expected_type = column_spec.get("type", "") + if detected_type.lower() != expected_type.lower(): raise TypeError( f"Row {row_idx}, column '{col_name}': " f"File type '{detected_type}' doesn't match " - f"expected type '{column_spec.get('type')}'" + f"expected type '{expected_type}'" ) row[col_name] = data diff --git a/DashAI/front/src/api/job.ts b/DashAI/front/src/api/job.ts index ef46e427b..b4018e30f 100644 --- a/DashAI/front/src/api/job.ts +++ b/DashAI/front/src/api/job.ts @@ -130,7 +130,9 @@ export const enqueuePredictionJob = async ( const cleanObj: any = {}; Object.entries(obj).forEach(([key, value]) => { if (value instanceof File) { - formData.append(`file_${i}_${key}`, value); // attach file + const fieldName = `file_${i}_${key}`; + formData.append(fieldName, value); + cleanObj[key] = fieldName; } else { cleanObj[key] = value; } diff --git a/DashAI/front/src/components/models/ManualPredictionPanel.jsx b/DashAI/front/src/components/models/ManualPredictionPanel.jsx index 76c057dd9..cec4b0b8e 100644 --- a/DashAI/front/src/components/models/ManualPredictionPanel.jsx +++ b/DashAI/front/src/components/models/ManualPredictionPanel.jsx @@ -319,9 +319,23 @@ export default 
function ManualPredictionPanel({ {row.map((cell, cellIdx) => ( - {cell !== null && cell !== undefined - ? String(cell) - : "—"} + {typeof cell === "string" && + cell.startsWith("data:image/") ? ( + img + ) : cell !== null && cell !== undefined ? ( + String(cell) + ) : ( + "—" + )} ))} diff --git a/DashAI/front/src/components/predictions/InputField.jsx b/DashAI/front/src/components/predictions/InputField.jsx index b5db318f8..de502d578 100644 --- a/DashAI/front/src/components/predictions/InputField.jsx +++ b/DashAI/front/src/components/predictions/InputField.jsx @@ -1,5 +1,12 @@ import React from "react"; -import { TextField, Select, MenuItem, FormControl } from "@mui/material"; +import { + TextField, + Select, + MenuItem, + FormControl, + Box, + Button, +} from "@mui/material"; import { useTheme } from "@mui/material/styles"; import { useTranslation } from "react-i18next"; @@ -113,6 +120,50 @@ function InputField({ ); } + if (effectiveType === "Image" || dtype === "image") { + return ( + + {value instanceof File && ( + preview + )} + + + ); + } + if ( effectiveType === "Text" || effectiveType === "string" || @@ -130,29 +181,6 @@ function InputField({ ); } - if (effectiveType === "Image" || dtype === "image") { - return ( - - handleChange(rowIndex, col, e.target.files?.[0])} - style={{ - fontSize: "0.875rem", - color: theme.palette.text.primary, - padding: "4px 0", - }} - /> - - ); - } - return ( { const typeInfo = types[col]; - if ( + if (typeInfo?.type === "Image") { + row[col] = null; + } else if ( typeInfo?.type === "Categorical" && typeInfo?.categories?.length > 0 ) { diff --git a/DashAI/front/src/components/predictions/PredictionModal.jsx b/DashAI/front/src/components/predictions/PredictionModal.jsx index 1fdd4dc5c..ba83d16ec 100644 --- a/DashAI/front/src/components/predictions/PredictionModal.jsx +++ b/DashAI/front/src/components/predictions/PredictionModal.jsx @@ -290,7 +290,16 @@ export default function PredictionModal({ isOpen, onClose, run }) { if 
(predictionMode === "dataset") { return selectedDataset !== null; } - return manualRows && manualRows.length > 0; + if (!manualRows || manualRows.length === 0) return false; + const imageColumns = Object.keys(types).filter( + (col) => types[col]?.type === "Image", + ); + if (imageColumns.length > 0) { + return manualRows.every((row) => + imageColumns.every((col) => row[col] instanceof File), + ); + } + return true; }; if (!isOpen || !run) return null; diff --git a/DashAI/front/src/utils/i18n/locales/en/prediction.json b/DashAI/front/src/utils/i18n/locales/en/prediction.json index f3ed639d8..e1868e2bc 100644 --- a/DashAI/front/src/utils/i18n/locales/en/prediction.json +++ b/DashAI/front/src/utils/i18n/locales/en/prediction.json @@ -57,6 +57,8 @@ "reviewDetails": "Review your prediction configuration before submitting", "runFirstPrediction": "Run your first prediction to see it here", "selectCategory": "Select a category", + "uploadImage": "Upload image", + "changeImage": "Change image", "selectDataset": "Select a dataset for prediction", "selectDatasetFromPlatform": "Select a dataset from platform", "selectMode": "Select Mode", diff --git a/DashAI/front/src/utils/i18n/locales/es/prediction.json b/DashAI/front/src/utils/i18n/locales/es/prediction.json index 6287225e0..d14c4ccb5 100644 --- a/DashAI/front/src/utils/i18n/locales/es/prediction.json +++ b/DashAI/front/src/utils/i18n/locales/es/prediction.json @@ -57,6 +57,8 @@ "reviewDetails": "Revise la configuración de su predicción antes de enviar", "runFirstPrediction": "Ejecute su primera predicción para verla aquí", "selectCategory": "Selecciona una categoría", + "uploadImage": "Subir imagen", + "changeImage": "Cambiar imagen", "selectDataset": "Seleccione un dataset para la predicción", "selectDatasetFromPlatform": "Seleccione un dataset de la plataforma", "selectMode": "Seleccionar Modo", From c12068f6fd7c186e2f9b6489f58966eb96c886eb Mon Sep 17 00:00:00 2001 From: Creylay Date: Tue, 21 Apr 2026 15:30:48 -0400 
Subject: [PATCH 010/361] feat: add useEffect to synchronize input value with prop changes in ArrayInput component --- .../components/configurableObject/Inputs/ArrayInput.jsx | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/DashAI/front/src/components/configurableObject/Inputs/ArrayInput.jsx b/DashAI/front/src/components/configurableObject/Inputs/ArrayInput.jsx index 436b723c5..46be6e56e 100644 --- a/DashAI/front/src/components/configurableObject/Inputs/ArrayInput.jsx +++ b/DashAI/front/src/components/configurableObject/Inputs/ArrayInput.jsx @@ -1,4 +1,4 @@ -import React, { useState } from "react"; +import React, { useState, useEffect } from "react"; import PropTypes from "prop-types"; import InputWithDebounce from "../../shared/InputWithDebounce"; import { FormControl } from "@mui/material"; @@ -16,6 +16,12 @@ function ArrayInput({ }) { const [inputValue, setInputValue] = useState(value.join(",")); + useEffect(() => { + if (Array.isArray(value)) { + setInputValue(value.join(",")); + } + }, [value]); + const convertValue = (val) => { switch (itemType) { case "integer": From 8a9159324b2ee205176ee9dfcd899c16dca5e68e Mon Sep 17 00:00:00 2001 From: Creylay Date: Tue, 21 Apr 2026 15:52:27 -0400 Subject: [PATCH 011/361] feat: update icon for MLPImageClassifier and add ImageClassificationTask icon in ModelCenterContent --- DashAI/back/models/mlp_image_classifier.py | 2 +- DashAI/front/src/components/models/ModelCenterContent.jsx | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/DashAI/back/models/mlp_image_classifier.py b/DashAI/back/models/mlp_image_classifier.py index 5ba66acea..b108a8c5f 100644 --- a/DashAI/back/models/mlp_image_classifier.py +++ b/DashAI/back/models/mlp_image_classifier.py @@ -185,7 +185,7 @@ class MLPImageClassifier(BaseModel): ), ) COLOR: str = "#E91E63" - ICON: str = "Image" + ICON: str = "ImageSearch" @staticmethod def _collate_fn_with_labels(batch): diff --git 
a/DashAI/front/src/components/models/ModelCenterContent.jsx b/DashAI/front/src/components/models/ModelCenterContent.jsx index c66b0671d..794f845bc 100644 --- a/DashAI/front/src/components/models/ModelCenterContent.jsx +++ b/DashAI/front/src/components/models/ModelCenterContent.jsx @@ -11,6 +11,7 @@ import { TextFields as TextClassificationIcon, TableChart as TabularClassificationIcon, Translate as TranslationIcon, + ImageSearch as ImageClassificationIcon, Science as DefaultTaskIcon, } from "@mui/icons-material"; @@ -18,6 +19,7 @@ const TASK_ICONS = { ClassificationTask: ClassificationIcon, TabularClassificationTask: TabularClassificationIcon, TextClassificationTask: TextClassificationIcon, + ImageClassificationTask: ImageClassificationIcon, RegressionTask: RegressionIcon, TranslationTask: TranslationIcon, }; From 461780c8e15af1c7b0016aa35ca224d3cd49d242 Mon Sep 17 00:00:00 2001 From: Creylay Date: Thu, 23 Apr 2026 10:46:07 -0400 Subject: [PATCH 012/361] feat: enhance image loading and processing in dataset handling, improve schema integration --- DashAI/back/api/api_v1/endpoints/datasets.py | 104 ++++++++--- .../dataloaders/classes/dashai_dataset.py | 108 +++++++----- .../dataloaders/classes/image_dataloader.py | 164 +++++++++++------- 3 files changed, 239 insertions(+), 137 deletions(-) diff --git a/DashAI/back/api/api_v1/endpoints/datasets.py b/DashAI/back/api/api_v1/endpoints/datasets.py index adedd6f0a..52cce5a0d 100644 --- a/DashAI/back/api/api_v1/endpoints/datasets.py +++ b/DashAI/back/api/api_v1/endpoints/datasets.py @@ -32,29 +32,57 @@ def _build_image_preview_sample( extract_dir: str, image_extensions: set, max_rows: int = 5 ) -> tuple: - """Walk an extracted imagefolder directory and return sample rows - and total count.""" + """Walk an extracted imagefolder directory and return sample rows, + total count, and whether class subdirectories exist. 
+ + Returns + ------- + tuple of (list[dict], int, bool) + (sample_rows, total_images, has_labels) + """ import os samples = [] total = 0 - for root, _, files in os.walk(extract_dir): + labels_seen = set() + for root, dirs, files in os.walk(extract_dir): + dirs[:] = [ + d for d in dirs if not d.startswith("__MACOSX") and not d.startswith(".") + ] for f in sorted(files): + if f.startswith("."): + continue ext = os.path.splitext(f)[1].lower() if ext not in image_extensions: continue total += 1 + label = os.path.basename(root) + labels_seen.add(label) if len(samples) >= max_rows: continue filepath = os.path.join(root, f) - label = os.path.basename(root) try: with open(filepath, "rb") as fh: - thumb = _image_bytes_to_thumbnail_data_uri(fh.read()) - samples.append({"image": thumb, "label": label}) + img_bytes = fh.read() + if len(img_bytes) == 0: + continue + thumb = _image_bytes_to_thumbnail_data_uri(img_bytes) + if thumb == "[Image]": + continue + samples.append({"image": thumb, "_label": label}) except Exception: continue - return samples, total + + has_labels = len(labels_seen) > 1 + + if has_labels: + for s in samples: + s["label"] = s.pop("_label") + else: + for s in samples: + s.pop("_label", None) + + return samples, total, has_labels def _image_bytes_to_thumbnail_data_uri(img_bytes: bytes, max_size: int = 64) -> str: @@ -66,12 +94,22 @@ def _image_bytes_to_thumbnail_data_uri(img_bytes: bytes, max_size: int = 64) -> try: img = Image.open(io.BytesIO(img_bytes)) + if img.mode in ("CMYK", "YCbCr", "LAB", "HSV"): + img = img.convert("RGB") + elif img.mode in ("LA", "PA"): + img = img.convert("RGBA") img.thumbnail((max_size, max_size)) buf = io.BytesIO() img.save(buf, format="PNG") b64 = base64.b64encode(buf.getvalue()).decode() return f"data:image/png;base64,{b64}" - except Exception: + except Exception as e: + logger.warning( + "Failed to generate thumbnail (len=%d, head=%r): %s", + len(img_bytes), + img_bytes[:16], + e, + ) return "[Image]" @@ -1900,8 +1938,15 @@ 
async def preview_with_types( dataloader_name = None matched_file = None has_images = False - for root, _, files in os.walk(extract_dir): + for root, dirs, files in os.walk(extract_dir): + dirs[:] = [ + d + for d in dirs + if not d.startswith("__MACOSX") and not d.startswith(".") + ] for f in files: + if f.startswith("."): + continue ext = os.path.splitext(f)[1].lower() if ext in supported_map: dataloader_name = supported_map[ext] @@ -1913,28 +1958,35 @@ async def preview_with_types( break if dataloader_name is None and has_images: - sample_rows, total_images = _build_image_preview_sample( - extract_dir, image_extensions, max_rows=5 + sample_rows, total_images, has_labels = ( + _build_image_preview_sample( + extract_dir, image_extensions, max_rows=5 + ) ) shutil.rmtree(extract_dir, ignore_errors=True) os.unlink(tmp_file_path) + + schema = { + "image": {"type": "Image", "dtype": "string"}, + } + inferred_types = { + "image": {"type": "Image", "dtype": "string"}, + } + if has_labels: + schema["label"] = { + "type": "Categorical", + "dtype": "string", + } + inferred_types["label"] = { + "type": "Categorical", + "dtype": "string", + "encoder": "one_hot", + } + return { "sample": sample_rows, - "schema": { - "image": {"type": "Image", "dtype": "string"}, - "label": { - "type": "Categorical", - "dtype": "string", - }, - }, - "inferred_types": { - "image": {"type": "Image", "dtype": "string"}, - "label": { - "type": "Categorical", - "dtype": "string", - "encoder": "one_hot", - }, - }, + "schema": schema, + "inferred_types": inferred_types, "preview_row_count": total_images, "types_inferred": False, } diff --git a/DashAI/back/dataloaders/classes/dashai_dataset.py b/DashAI/back/dataloaders/classes/dashai_dataset.py index c94dc49dc..82f33f6de 100644 --- a/DashAI/back/dataloaders/classes/dashai_dataset.py +++ b/DashAI/back/dataloaders/classes/dashai_dataset.py @@ -773,60 +773,72 @@ def transform_dataset_with_schema( my_schema = pa.schema([]) dashai_types = {} - for 
column_name, info in schema.items(): - # Skip columns that don't exist in the dataset - if column_name not in dataset.column_names: - continue - - _type = info.get("type") - dtype = info.get("dtype") - pa_type = to_arrow_types(dtype) - if _type == "Categorical": - base_col = table.column(column_name) - converted = info.get("converted", False) - - # Always infer categories from actual data to ensure all values are - # included. Type inference (ptype) excludes anomalous values from - # its suggested categories, which causes KeyErrors during training - # when those "anomalous" values appear in the data. - col_list = base_col.to_pylist() - categories = sorted({v for v in col_list if v is not None}) - - encoder = info.get("encoder", "one_hot") - dashai_types[column_name] = Categorical( - values=categories, converted=converted, dtype=dtype, encoder=encoder - ) - # Keep the column data as-is without converting to string - dai_table[column_name] = base_col - # Use the dtype from schema for pa_type - pa_type = to_arrow_types(dtype) - elif _type == "Image": - from DashAI.back.types.dashai_image import DashAIImage - - dashai_types[column_name] = DashAIImage(dtype=dtype) - dai_table[column_name] = table.column(column_name) + # First, include all columns from the dataset in the order they appear + for column_name in dataset.column_names: + if column_name not in schema: + # Column not in schema - preserve it as-is with inferred type + col_data = table.column(column_name) + dai_table[column_name] = col_data pa_type = table.schema.field(column_name).type + my_schema = my_schema.append(pa.field(column_name, pa_type)) + + # Infer the DashAI type from Arrow type + dashai_types[column_name] = arrow_to_dashai_types(pa_type) else: - if _type in ["Date", "Time", "Timestamp"]: - # Since DashAI is not using date, time or timestamp types for its models - # we are saving them as strings to preserve the original format. 
- # Can modify classes in value_types.py - # if want to use PyArrow date, time or timestamp types. - dashai_types[column_name] = arrow_to_dashai_types( - arrow_type=_type, format=dtype + # Column is in schema - process according to schema definition + info = schema[column_name] + _type = info.get("type") + dtype = info.get("dtype") + pa_type = to_arrow_types(dtype) + if _type == "Categorical": + base_col = table.column(column_name) + converted = info.get("converted", False) + + # Always infer categories from actual data to ensure all values are + # included. Type inference (ptype) excludes anomalous values from + # its suggested categories, which causes KeyErrors during training + # when those "anomalous" values appear in the data. + col_list = base_col.to_pylist() + categories = sorted({v for v in col_list if v is not None}) + + encoder = info.get("encoder", "one_hot") + dashai_types[column_name] = Categorical( + values=categories, converted=converted, dtype=dtype, encoder=encoder ) - pa_type = to_arrow_types("string") + # Keep the column data as-is without converting to string + dai_table[column_name] = base_col + # Use the dtype from schema for pa_type + pa_type = to_arrow_types(dtype) + elif _type == "Image": + from DashAI.back.types.dashai_image import DashAIImage + + dashai_types[column_name] = DashAIImage(dtype=dtype) dai_table[column_name] = table.column(column_name) - - elif _type == "Float": - dashai_types[column_name] = arrow_to_dashai_types(pa_type) - dai_table[column_name] = comma_float_to_float(table.column(column_name)) - + pa_type = table.schema.field(column_name).type else: - dashai_types[column_name] = arrow_to_dashai_types(pa_type) - dai_table[column_name] = table.column(column_name) + if _type in ["Date", "Time", "Timestamp"]: + # Since DashAI is not using date, time or timestamp types for its + # models + # we are saving them as strings to preserve the original format. 
+ # Can modify classes in value_types.py + # if want to use PyArrow date, time or timestamp types. + dashai_types[column_name] = arrow_to_dashai_types( + arrow_type=_type, format=dtype + ) + pa_type = to_arrow_types("string") + dai_table[column_name] = table.column(column_name) + + elif _type == "Float": + dashai_types[column_name] = arrow_to_dashai_types(pa_type) + dai_table[column_name] = comma_float_to_float( + table.column(column_name) + ) + + else: + dashai_types[column_name] = arrow_to_dashai_types(pa_type) + dai_table[column_name] = table.column(column_name) - my_schema = my_schema.append(pa.field(column_name, pa_type)) + my_schema = my_schema.append(pa.field(column_name, pa_type)) # Create the transformed table with the new schema transformed_table = pa.table(dai_table) diff --git a/DashAI/back/dataloaders/classes/image_dataloader.py b/DashAI/back/dataloaders/classes/image_dataloader.py index d6a54e242..c50e72d4e 100644 --- a/DashAI/back/dataloaders/classes/image_dataloader.py +++ b/DashAI/back/dataloaders/classes/image_dataloader.py @@ -4,7 +4,7 @@ from typing import Any, Dict from beartype import beartype -from datasets import Dataset, IterableDatasetDict, load_dataset +from datasets import Dataset from DashAI.back.core.schema_fields import none_type, schema_field, string_field from DashAI.back.core.schema_fields.base_schema import BaseSchema @@ -27,6 +27,89 @@ class ImageDataLoaderSchema(BaseSchema): ) # type: ignore +IMAGE_EXTENSIONS = { + ".png", + ".jpg", + ".jpeg", + ".bmp", + ".gif", + ".tiff", + ".webp", +} + + +def _find_imagefolder_root(base_path: str) -> str: + """Descend into single-child directories until we find the level + that contains the class subdirectories.""" + import os + + while True: + entries = [ + e + for e in os.listdir(base_path) + if not e.startswith(".") and e != "__MACOSX" + ] + if len(entries) == 1 and os.path.isdir(os.path.join(base_path, entries[0])): + base_path = os.path.join(base_path, entries[0]) + else: + break + 
return base_path + + +def _load_images_from_directory(data_dir: str, n_sample=None): + """Walk the directory structure and build a list of dicts with + 'image' (bytes+format) and 'label' (parent folder name) entries. + + This replaces HF's imagefolder loader to guarantee label detection. + """ + import io + import os + + from PIL import Image as PILImage + + records = [] + entries = [ + e + for e in sorted(os.listdir(data_dir)) + if not e.startswith(".") and e != "__MACOSX" + ] + + for class_name in entries: + class_path = os.path.join(data_dir, class_name) + if not os.path.isdir(class_path): + continue + for root, _dirs, files in os.walk(class_path): + for fname in sorted(files): + ext = os.path.splitext(fname)[1].lower() + if ext not in IMAGE_EXTENSIONS: + continue + fpath = os.path.join(root, fname) + try: + img = PILImage.open(fpath) + img.load() + if img.mode in ("CMYK", "YCbCr", "LAB", "HSV"): + img = img.convert("RGB") + elif img.mode in ("LA", "PA"): + img = img.convert("RGBA") + buf = io.BytesIO() + fmt = img.format or "PNG" + img.save(buf, format=fmt) + records.append( + { + "image": { + "bytes": buf.getvalue(), + "format": fmt, + }, + "label": class_name, + } + ) + except Exception: + continue + if n_sample and len(records) >= n_sample: + return records + return records + + class ImageDataLoader(BaseDataLoader): """Data loader for image datasets. @@ -82,7 +165,10 @@ def load_data( DashAIDataset A DashAI Dataset with the loaded image data. """ - import io + import logging + import os + + log = logging.getLogger(__name__) prepared_path = self.prepare_files(filepath_or_buffer, temp_path) @@ -91,81 +177,33 @@ def load_data( "The image dataloader requires the input file to be a zip file." 
) - dataset = load_dataset( - "imagefolder", - data_dir=prepared_path[0], - streaming=bool(n_sample), - cache_dir=temp_path, + data_dir = _find_imagefolder_root(prepared_path[0]) + log.debug("Resolved data_dir: %s", data_dir) + log.debug( + "data_dir contents: %s", + [e for e in os.listdir(data_dir) if not e.startswith(".")], ) - if n_sample: - if isinstance(dataset, IterableDatasetDict): - dataset = dataset["train"] - dataset = Dataset.from_list(list(dataset.take(n_sample))) + records = _load_images_from_directory(data_dir, n_sample) + log.debug("Loaded %d images from directory", len(records)) - def convert_image_to_bytes(example): - buffer = io.BytesIO() - img_format = example["image"].format or "PNG" - example["image"].save(buffer, format=img_format) - return {"image": {"bytes": buffer.getvalue(), "format": img_format}} + if not records: + raise ValueError("No images found in the uploaded zip file.") - dataset = dataset.map(convert_image_to_bytes) - - # Convert ClassLabel columns (integers) back to their string names. - # HF ClassLabel silently casts map() outputs back to int, so we need - # to build a new dataset with the strings directly. 
- from datasets import ClassLabel as HFClassLabel - from datasets import Features, Value - - if isinstance(dataset, Dataset): - ds_ref = dataset - else: - first_key = list(dataset.keys())[0] - ds_ref = dataset[first_key] - - classlabel_cols = {} - for col in ds_ref.column_names: - feat = ds_ref.features.get(col) - if isinstance(feat, HFClassLabel): - classlabel_cols[col] = feat.names - - if classlabel_cols: - new_features = Features( - { - col: Value("string") if col in classlabel_cols else feat - for col, feat in ds_ref.features.items() - } - ) - - def convert_labels(example): - for col, names in classlabel_cols.items(): - example[col] = names[example[col]] - return example - - if isinstance(dataset, Dataset): - dataset = dataset.map(convert_labels, features=new_features) - else: - for split_name in list(dataset.keys()): - dataset[split_name] = dataset[split_name].map( - convert_labels, features=new_features - ) - ds_ref = dataset[first_key] + dataset = Dataset.from_list(records) + log.debug("Dataset columns: %s", dataset.column_names) shutil.rmtree(prepared_path[0]) from DashAI.back.types.categorical import Categorical from DashAI.back.types.dashai_image import DashAIImage - ds_for_types = dataset if isinstance(dataset, Dataset) else ds_ref - types = {} - for col in ds_for_types.column_names: + for col in dataset.column_names: if col == "image": types[col] = DashAIImage() else: - unique_vals = sorted( - {str(v) for v in ds_for_types[col] if v is not None} - ) + unique_vals = sorted({str(v) for v in dataset[col] if v is not None}) types[col] = Categorical(values=unique_vals, dtype="string") return to_dashai_dataset(dataset, types=types) From a9c6bc30dbb051fee3fe8d7e67dea0ba2cf0c7ef Mon Sep 17 00:00:00 2001 From: Creylay Date: Fri, 24 Apr 2026 13:56:20 -0400 Subject: [PATCH 013/361] feat: update DashAIImage dtype to 'struct' and adjust related type lists for consistency --- DashAI/back/types/dashai_image.py | 15 ++++----------- DashAI/back/types/utils.py | 2 +- 
DashAI/front/src/utils/typesLists.js | 3 ++- 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/DashAI/back/types/dashai_image.py b/DashAI/back/types/dashai_image.py index 959db4781..775d4fba3 100644 --- a/DashAI/back/types/dashai_image.py +++ b/DashAI/back/types/dashai_image.py @@ -14,22 +14,15 @@ class DashAIImage(DashAIDataType): Attributes ---------- dtype : str - The data type of the image, default is "string" which represents the path to the image. + The data type of the image, default is "struct" (Arrow struct). base_path : Optional[str] - An optional base path for images, useful if images are represented just by their filenames. + An optional base path for images. """ - dtype: str = "string" # Default: Path to image (str) - #### - # Optional base path in case images are represented just by their filenames - # Since Dataloaders are not something I'm working on, - # this was done in the basis that the final image dataloader - # will contain an optional parameter to specify a base path for images - # (If they are in the same folder, for example). 
+ dtype: str = "struct" base_path: Optional[str] = None - #### - def __init__(self, dtype: str = "string"): + def __init__(self, dtype: str = "struct"): self.dtype = dtype def to_string(self): diff --git a/DashAI/back/types/utils.py b/DashAI/back/types/utils.py index 86dfd87a6..5611a488a 100644 --- a/DashAI/back/types/utils.py +++ b/DashAI/back/types/utils.py @@ -228,7 +228,7 @@ def get_types_from_arrow_metadata( elif _type == "Image": from DashAI.back.types.dashai_image import DashAIImage - dtype = info.get("dtype", "string") + dtype = info.get("dtype", "struct") dashai_types[column] = DashAIImage(dtype=dtype) else: dtype = info.get("dtype") diff --git a/DashAI/front/src/utils/typesLists.js b/DashAI/front/src/utils/typesLists.js index c597cb09f..a2570a9e4 100644 --- a/DashAI/front/src/utils/typesLists.js +++ b/DashAI/front/src/utils/typesLists.js @@ -26,6 +26,7 @@ export const dataTypesList = [ "large_binary", "string", "large_string", + "struct", ]; export const columnTypesList = [ @@ -63,5 +64,5 @@ export const dataTypesbyColumnType = { Date: ["date32", "date64"], Binary: ["binary", "large_binary"], Categorical: ["string"], - Image: ["string", "binary", "large_binary"], + Image: ["struct"], }; From 423f7c58e251612727746805ceac8c0448c569bd Mon Sep 17 00:00:00 2001 From: Creylay Date: Sat, 2 May 2026 12:04:28 -0400 Subject: [PATCH 014/361] fix: update inferred_types check and correct image dtype documentation --- DashAI/back/job/dataset_job.py | 3 +-- DashAI/back/types/dashai_image.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/DashAI/back/job/dataset_job.py b/DashAI/back/job/dataset_job.py index 4d2787a67..9d591d0c8 100644 --- a/DashAI/back/job/dataset_job.py +++ b/DashAI/back/job/dataset_job.py @@ -174,7 +174,7 @@ def run( n_sample=n_sample, ) - if "inferred_types" in params: + if params.get("inferred_types"): schema = params["inferred_types"] elif new_dataset.types: schema = { @@ -185,7 +185,6 @@ def run( schema = infer_types( 
new_dataset.to_pandas(), method="DashAIPtype" ) - if "column_renames" in params: renames = params["column_renames"] original_names = new_dataset.arrow_table.schema.names diff --git a/DashAI/back/types/dashai_image.py b/DashAI/back/types/dashai_image.py index 775d4fba3..0b277ecf7 100644 --- a/DashAI/back/types/dashai_image.py +++ b/DashAI/back/types/dashai_image.py @@ -14,7 +14,7 @@ class DashAIImage(DashAIDataType): Attributes ---------- dtype : str - The data type of the image, default is "struct" (Arrow struct). + The data type of the image, default is "struct" (Arrow struct). base_path : Optional[str] An optional base path for images. """ From 78ff4be3730903a755b1848f2d99a9e35cd7d3c8 Mon Sep 17 00:00:00 2001 From: Creylay Date: Sat, 2 May 2026 12:04:59 -0400 Subject: [PATCH 015/361] fix: update image loading to store file path instead of format --- DashAI/back/api/api_v1/endpoints/datasets.py | 24 ++++++++++++------- .../dataloaders/classes/image_dataloader.py | 2 +- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/DashAI/back/api/api_v1/endpoints/datasets.py b/DashAI/back/api/api_v1/endpoints/datasets.py index 52cce5a0d..185c009dc 100644 --- a/DashAI/back/api/api_v1/endpoints/datasets.py +++ b/DashAI/back/api/api_v1/endpoints/datasets.py @@ -1605,8 +1605,12 @@ async def export_dataset_as_csv( val = arr_col[i].as_py() if isinstance(val, dict) and val.get("bytes"): img_bytes = val["bytes"] - fmt = (val.get("format", "PNG") or "PNG").lower() - ext = "jpg" if fmt == "jpeg" else fmt + raw_ext = ( + os.path.splitext(val.get("path") or "")[1] + .lstrip(".") + .lower() + ) + ext = raw_ext if raw_ext else "png" elif isinstance(val, bytes) and val: img_bytes = val ext = "png" @@ -1783,10 +1787,14 @@ async def export_dataset_csv_by_id( struct_val = struct_col[i].as_py() if struct_val and struct_val.get("bytes"): img_bytes = struct_val["bytes"] - fmt = ( - struct_val.get("format", "PNG") or "PNG" - ).lower() - ext = "jpg" if fmt == "jpeg" else fmt + raw_ext = 
( + os.path.splitext(struct_val.get("path") or "")[ + 1 + ] + .lstrip(".") + .lower() + ) + ext = raw_ext if raw_ext else "png" fname = f"images/{col}_{i}.{ext}" zf.writestr(fname, img_bytes) image_filenames[col].append(fname) @@ -1967,10 +1975,10 @@ async def preview_with_types( os.unlink(tmp_file_path) schema = { - "image": {"type": "Image", "dtype": "string"}, + "image": {"type": "Image", "dtype": "struct"}, } inferred_types = { - "image": {"type": "Image", "dtype": "string"}, + "image": {"type": "Image", "dtype": "struct"}, } if has_labels: schema["label"] = { diff --git a/DashAI/back/dataloaders/classes/image_dataloader.py b/DashAI/back/dataloaders/classes/image_dataloader.py index c50e72d4e..7a939c1b4 100644 --- a/DashAI/back/dataloaders/classes/image_dataloader.py +++ b/DashAI/back/dataloaders/classes/image_dataloader.py @@ -98,7 +98,7 @@ def _load_images_from_directory(data_dir: str, n_sample=None): { "image": { "bytes": buf.getvalue(), - "format": fmt, + "path": fname, }, "label": class_name, } From 43bc91e30b0d3c7864c4de72cb9975b8bc25a5dc Mon Sep 17 00:00:00 2001 From: Creylay Date: Mon, 4 May 2026 00:10:10 -0400 Subject: [PATCH 016/361] feat: enhance image handling in DashAIDataset and MLPImageClassifier --- .../dataloaders/classes/dashai_dataset.py | 31 +++++++++ DashAI/back/models/mlp_image_classifier.py | 63 +++++------------- DashAI/back/types/dashai_image.py | 65 +++++++++++-------- 3 files changed, 84 insertions(+), 75 deletions(-) diff --git a/DashAI/back/dataloaders/classes/dashai_dataset.py b/DashAI/back/dataloaders/classes/dashai_dataset.py index 82f33f6de..82d96b2c9 100644 --- a/DashAI/back/dataloaders/classes/dashai_dataset.py +++ b/DashAI/back/dataloaders/classes/dashai_dataset.py @@ -665,6 +665,37 @@ def select_columns(self, column_names: Union[str, List[str]]) -> "DashAIDataset" return DashAIDataset(table=subset_table, splits=self.splits, types=subset_types) + def __getitem__(self, key): + result = super().__getitem__(key) + if not 
isinstance(result, dict): + return result + + from DashAI.back.types.dashai_image import DashAIImage + + image_cols = [ + c + for c, t in self._types.items() + if isinstance(t, DashAIImage) and c in result + ] + if not image_cols: + return result + + if isinstance(key, int): + for col in image_cols: + if isinstance(result[col], dict): + result[col] = DashAIImage( + bytes=result[col].get("bytes"), path=result[col].get("path") + ) + elif isinstance(key, slice): + for col in image_cols: + result[col] = [ + DashAIImage(bytes=v.get("bytes"), path=v.get("path")) + if isinstance(v, dict) + else v + for v in result[col] + ] + return result + @beartype def select(self, *args, **kwargs) -> "DashAIDataset": """ diff --git a/DashAI/back/models/mlp_image_classifier.py b/DashAI/back/models/mlp_image_classifier.py index b108a8c5f..70775ea6a 100644 --- a/DashAI/back/models/mlp_image_classifier.py +++ b/DashAI/back/models/mlp_image_classifier.py @@ -1,6 +1,5 @@ """MLP-based image classifier for DashAI.""" -import datasets import torch import torch.nn as nn import torch.optim as optim @@ -68,11 +67,11 @@ class MLPImageClassifierSchema(BaseSchema): class _ImageDataset(torch.utils.data.Dataset): - """Torch Dataset wrapper for HuggingFace image datasets.""" + """Torch Dataset wrapper for DashAI image datasets.""" - def __init__(self, hf_dataset: datasets.Dataset, has_labels: bool = True): - self.dataset = hf_dataset - self.has_labels = has_labels + def __init__(self, x_dataset, y_dataset=None): + self.x_dataset = x_dataset + self.y_dataset = y_dataset self.transforms = transforms.Compose( [ transforms.Resize((30, 30)), @@ -80,40 +79,21 @@ def __init__(self, hf_dataset: datasets.Dataset, has_labels: bool = True): ] ) - column_names = list(self.dataset.features.keys()) - self.image_col_name = column_names[0] + self.image_col_name = list(x_dataset.features.keys())[0] self.label_col_name = ( - column_names[1] if has_labels and len(column_names) > 1 else None + 
list(y_dataset.features.keys())[0] if y_dataset is not None else None ) - # Create label to index mapping if labels exist self.label_to_idx = {} self.idx_to_label = {} if self.label_col_name: - unique_labels = sorted(set(self.dataset[self.label_col_name])) + unique_labels = sorted(set(self.y_dataset[self.label_col_name])) self.label_to_idx = {label: idx for idx, label in enumerate(unique_labels)} self.idx_to_label = {idx: label for label, idx in self.label_to_idx.items()} - pil_image = self._get_pil_image(self.dataset[0][self.image_col_name]) - self.tensor_shape = self.transforms(pil_image).shape - - @staticmethod - def _get_pil_image(img_data): - """Convert image data (dict with bytes or PIL.Image) to PIL.Image.""" - import io - - from PIL import Image - - if isinstance(img_data, dict) and "bytes" in img_data: - buffer = io.BytesIO(img_data["bytes"]) - return Image.open(buffer) - elif isinstance(img_data, bytes): - buffer = io.BytesIO(img_data) - return Image.open(buffer) - elif hasattr(img_data, "format"): - return img_data - else: - raise TypeError(f"Unsupported image data type: {type(img_data)}") + self.tensor_shape = self.transforms( + self.x_dataset[0][self.image_col_name].to_pil() + ).shape def num_classes(self): if self.label_col_name is None: @@ -121,15 +101,13 @@ def num_classes(self): return len(self.label_to_idx) def __len__(self): - return len(self.dataset) + return len(self.x_dataset) def __getitem__(self, idx): - pil_image = self._get_pil_image(self.dataset[idx][self.image_col_name]) - image = self.transforms(pil_image) + image = self.transforms(self.x_dataset[idx][self.image_col_name].to_pil()) if self.label_col_name is None: return image - # Convert label string to index - label_str = self.dataset[idx][self.label_col_name] + label_str = self.y_dataset[idx][self.label_col_name] label_idx = self.label_to_idx[label_str] return image, label_idx @@ -247,16 +225,7 @@ def train(self, x_train, y_train, x_validation=None, y_validation=None): 
MLPImageClassifier The trained model instance. """ - image_col = list(x_train.features.keys())[0] - label_col = list(y_train.features.keys())[0] - - hf_dataset = datasets.Dataset.from_dict( - { - "image": x_train[image_col], - "label": y_train[label_col], - } - ) - image_dataset = _ImageDataset(hf_dataset, has_labels=True) + image_dataset = _ImageDataset(x_train, y_dataset=y_train) self.input_dim = ( image_dataset.tensor_shape[0] @@ -306,9 +275,7 @@ def predict(self, x): list of lists List of predicted probabilities for each class for each image. """ - image_col = list(x.features.keys())[0] - hf_dataset = datasets.Dataset.from_dict({"image": x[image_col]}) - image_dataset = _ImageDataset(hf_dataset, has_labels=False) + image_dataset = _ImageDataset(x, y_dataset=None) test_loader = torch.utils.data.DataLoader( image_dataset, batch_size=32, diff --git a/DashAI/back/types/dashai_image.py b/DashAI/back/types/dashai_image.py index 0b277ecf7..e1e77b815 100644 --- a/DashAI/back/types/dashai_image.py +++ b/DashAI/back/types/dashai_image.py @@ -1,40 +1,51 @@ -# flake8: noqa -# Not implemented yet -from dataclasses import dataclass -from typing import Optional +"""DashAI Image type.""" + +from __future__ import annotations + +import io +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, ClassVar, Optional + +import pyarrow as pa from DashAI.back.types.dashai_data_type import DashAIDataType +if TYPE_CHECKING: + import numpy as np + from PIL import Image as PILImage + @dataclass class DashAIImage(DashAIDataType): + """Image type for DashAI datasets. + + Serves dual roles: + - Column type descriptor: ``DashAIImage()`` — bytes/path are None. + - Data instance returned by ``DashAIDataset.__getitem__``: + ``DashAIImage(bytes=b"...", path="cat.jpg")``. """ - Represents an image data type in DashAI. - - Attributes - ---------- - dtype : str - The data type of the image, default is "struct" (Arrow struct). 
- base_path : Optional[str] - An optional base path for images. - """ + + pa_type: ClassVar[pa.DataType] = pa.struct( + {"bytes": pa.binary(), "path": pa.string()} + ) dtype: str = "struct" - base_path: Optional[str] = None + bytes: Optional[bytes] = field(default=None, repr=False) + path: Optional[str] = field(default=None, repr=False) - def __init__(self, dtype: str = "struct"): - self.dtype = dtype + def to_string(self) -> dict: + return {"type": "Image", "dtype": self.dtype} - def to_string(self): - """ - Convert the DashAIImage type to a string representation. + def to_pil(self) -> "PILImage": + """Decode image bytes to a PIL Image.""" + from PIL import Image as PILImage - Returns - ------- - dict - A dictionary representation of the DashAIImage type. - """ - if self.base_path: - return {"type": "Image", "dtype": self.dtype, "base_path": self.base_path} + if self.bytes is None: + raise ValueError("No image bytes available.") + return PILImage.open(io.BytesIO(self.bytes)) - return {"type": "Image", "dtype": self.dtype} + def to_numpy(self) -> "np.ndarray": + """Decode image bytes to a NumPy array (H x W x C, uint8).""" + import numpy as np + + return np.array(self.to_pil()) From fda89816f49ad066b317574196dc8ea47e85fce8 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Tue, 5 May 2026 12:38:20 -0400 Subject: [PATCH 017/361] feat: add BaseDatasetSource and DatasetEntry base classes --- DashAI/back/dataset_sources/__init__.py | 0 .../dataset_sources/base_dataset_source.py | 131 ++++++++++++++++++ tests/back/dataset_sources/__init__.py | 0 .../test_base_dataset_source.py | 98 +++++++++++++ 4 files changed, 229 insertions(+) create mode 100644 DashAI/back/dataset_sources/__init__.py create mode 100644 DashAI/back/dataset_sources/base_dataset_source.py create mode 100644 tests/back/dataset_sources/__init__.py create mode 100644 tests/back/dataset_sources/test_base_dataset_source.py diff --git a/DashAI/back/dataset_sources/__init__.py 
b/DashAI/back/dataset_sources/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/DashAI/back/dataset_sources/base_dataset_source.py b/DashAI/back/dataset_sources/base_dataset_source.py new file mode 100644 index 000000000..981f315a4 --- /dev/null +++ b/DashAI/back/dataset_sources/base_dataset_source.py @@ -0,0 +1,131 @@ +"""Base classes for DashAI dataset sources.""" + +from abc import abstractmethod +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Final + +from DashAI.back.config_object import ConfigObject +from DashAI.back.core.utils import MultilingualString + +if TYPE_CHECKING: + import pandas as pd + + +@dataclass +class DatasetEntry: + """Represents a single dataset retrieved from an external source. + + Parameters + ---------- + id : str + Source-specific unique identifier (e.g. ``"owner/name"`` for HuggingFace). + name : str + Human-readable dataset name. + description : str + Short description of the dataset. + tags : list[str] + List of topic/task tags. + size_bytes : int or None + Total compressed size in bytes, or None if unknown. + row_count : int or None + Number of rows, or None if unknown. + url : str + Link to the dataset page on the source website. + source : str + Class name of the DatasetSource that produced this entry. + """ + + id: str + name: str + description: str + tags: list[str] + size_bytes: int | None + row_count: int | None + url: str + source: str + + +class BaseDatasetSource(ConfigObject): + """Abstract base class for all DashAI dataset sources. + + Subclasses connect to external dataset repositories (HuggingFace Hub, + OpenML, Kaggle, etc.) and expose a uniform interface for searching, + previewing, and downloading datasets. 
+ """ + + TYPE: Final[str] = "DatasetSource" + DISPLAY_NAME: Final = MultilingualString(en="", es="") + DESCRIPTION: Final = MultilingualString(en="", es="") + + @abstractmethod + def search(self, query: str, limit: int = 20, **filters: Any) -> list[DatasetEntry]: + """Return datasets matching a query string. + + Parameters + ---------- + query : str + Free-text search string. + limit : int, optional + Maximum number of results, by default 20. + **filters : Any + Source-specific filter keyword arguments. + + Returns + ------- + list[DatasetEntry] + Matching datasets from this source. + """ + raise NotImplementedError + + @abstractmethod + def fetch_preview(self, dataset_id: str, n_rows: int = 100) -> "pd.DataFrame": + """Download a sample of rows without fetching the full dataset. + + Parameters + ---------- + dataset_id : str + Source-specific dataset identifier. + n_rows : int, optional + Number of sample rows to retrieve, by default 100. + + Returns + ------- + pd.DataFrame + Sample rows as a pandas DataFrame. + """ + raise NotImplementedError + + @abstractmethod + def fetch_full(self, dataset_id: str, temp_path: str) -> tuple[str, str]: + """Download the full dataset to a local temp directory. + + Parameters + ---------- + dataset_id : str + Source-specific dataset identifier. + temp_path : str + Local directory path to download into. + + Returns + ------- + tuple[str, str] + ``(local_file_path, dataloader_name)`` — path to the downloaded + file and the DashAI DataLoader class name to use for loading it. + """ + raise NotImplementedError + + @abstractmethod + def get_download_url(self, dataset_id: str) -> str: + """Return a direct URL the browser can use to download the dataset. + + Parameters + ---------- + dataset_id : str + Source-specific dataset identifier. + + Returns + ------- + str + Direct download URL. 
+ """ + raise NotImplementedError diff --git a/tests/back/dataset_sources/__init__.py b/tests/back/dataset_sources/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/back/dataset_sources/test_base_dataset_source.py b/tests/back/dataset_sources/test_base_dataset_source.py new file mode 100644 index 000000000..2fed26208 --- /dev/null +++ b/tests/back/dataset_sources/test_base_dataset_source.py @@ -0,0 +1,98 @@ +"""Tests for BaseDatasetSource and DatasetEntry.""" +import pytest +from dataclasses import asdict +from DashAI.back.dataset_sources.base_dataset_source import DatasetEntry, BaseDatasetSource + + +class ConcreteSource(BaseDatasetSource): + DISPLAY_NAME = "Test Source" + DESCRIPTION = "A test source" + + def search(self, query, limit=20, **filters): + return [ + DatasetEntry( + id="test/dataset", + name="Test Dataset", + description="A test dataset", + tags=["tabular"], + size_bytes=1024, + row_count=100, + url="https://example.com/test", + source="ConcreteSource", + ) + ] + + def fetch_preview(self, dataset_id, n_rows=100): + import pandas as pd + return pd.DataFrame({"col_a": [1, 2], "col_b": ["x", "y"]}) + + def fetch_full(self, dataset_id, temp_path): + return ("/tmp/file.csv", "CSVDataLoader") + + def get_download_url(self, dataset_id): + return f"https://example.com/download/{dataset_id}" + + +def test_dataset_entry_fields(): + entry = DatasetEntry( + id="owner/name", + name="My Dataset", + description="desc", + tags=["nlp"], + size_bytes=2048, + row_count=500, + url="https://example.com", + source="HuggingFaceDatasetSource", + ) + assert entry.id == "owner/name" + assert entry.name == "My Dataset" + assert entry.tags == ["nlp"] + assert entry.source == "HuggingFaceDatasetSource" + + +def test_dataset_entry_optional_fields(): + entry = DatasetEntry( + id="x", + name="x", + description="", + tags=[], + size_bytes=None, + row_count=None, + url="", + source="", + ) + assert entry.size_bytes is None + assert entry.row_count is None 
+ + +def test_concrete_source_has_type(): + assert ConcreteSource.TYPE == "DatasetSource" + + +def test_concrete_source_search_returns_entries(): + source = ConcreteSource() + results = source.search("test") + assert len(results) == 1 + assert isinstance(results[0], DatasetEntry) + assert results[0].id == "test/dataset" + + +def test_concrete_source_fetch_preview_returns_dataframe(): + import pandas as pd + source = ConcreteSource() + df = source.fetch_preview("test/dataset", n_rows=2) + assert isinstance(df, pd.DataFrame) + assert list(df.columns) == ["col_a", "col_b"] + + +def test_concrete_source_fetch_full_returns_path_and_dataloader(): + source = ConcreteSource() + path, dataloader_name = source.fetch_full("test/dataset", "/tmp") + assert path == "/tmp/file.csv" + assert dataloader_name == "CSVDataLoader" + + +def test_concrete_source_get_download_url(): + source = ConcreteSource() + url = source.get_download_url("owner/name") + assert url == "https://example.com/download/owner/name" From a824ca757ba9a19b6e4e3ae8cd1dba9b18ce5d12 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Tue, 5 May 2026 12:40:34 -0400 Subject: [PATCH 018/361] fix: enforce ABC contract in BaseDatasetSource --- DashAI/back/dataset_sources/base_dataset_source.py | 12 ++++++------ .../back/dataset_sources/test_base_dataset_source.py | 9 ++++++++- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/DashAI/back/dataset_sources/base_dataset_source.py b/DashAI/back/dataset_sources/base_dataset_source.py index 981f315a4..b201a586f 100644 --- a/DashAI/back/dataset_sources/base_dataset_source.py +++ b/DashAI/back/dataset_sources/base_dataset_source.py @@ -1,6 +1,6 @@ """Base classes for DashAI dataset sources.""" -from abc import abstractmethod +from abc import ABC, abstractmethod from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Final @@ -45,7 +45,7 @@ class DatasetEntry: source: str -class BaseDatasetSource(ConfigObject): +class BaseDatasetSource(ConfigObject, ABC): 
"""Abstract base class for all DashAI dataset sources. Subclasses connect to external dataset repositories (HuggingFace Hub, @@ -75,7 +75,7 @@ def search(self, query: str, limit: int = 20, **filters: Any) -> list[DatasetEnt list[DatasetEntry] Matching datasets from this source. """ - raise NotImplementedError + ... @abstractmethod def fetch_preview(self, dataset_id: str, n_rows: int = 100) -> "pd.DataFrame": @@ -93,7 +93,7 @@ def fetch_preview(self, dataset_id: str, n_rows: int = 100) -> "pd.DataFrame": pd.DataFrame Sample rows as a pandas DataFrame. """ - raise NotImplementedError + ... @abstractmethod def fetch_full(self, dataset_id: str, temp_path: str) -> tuple[str, str]: @@ -112,7 +112,7 @@ def fetch_full(self, dataset_id: str, temp_path: str) -> tuple[str, str]: ``(local_file_path, dataloader_name)`` — path to the downloaded file and the DashAI DataLoader class name to use for loading it. """ - raise NotImplementedError + ... @abstractmethod def get_download_url(self, dataset_id: str) -> str: @@ -128,4 +128,4 @@ def get_download_url(self, dataset_id: str) -> str: str Direct download URL. """ - raise NotImplementedError + ... 
diff --git a/tests/back/dataset_sources/test_base_dataset_source.py b/tests/back/dataset_sources/test_base_dataset_source.py index 2fed26208..87df21dd0 100644 --- a/tests/back/dataset_sources/test_base_dataset_source.py +++ b/tests/back/dataset_sources/test_base_dataset_source.py @@ -1,6 +1,5 @@ """Tests for BaseDatasetSource and DatasetEntry.""" import pytest -from dataclasses import asdict from DashAI.back.dataset_sources.base_dataset_source import DatasetEntry, BaseDatasetSource @@ -96,3 +95,11 @@ def test_concrete_source_get_download_url(): source = ConcreteSource() url = source.get_download_url("owner/name") assert url == "https://example.com/download/owner/name" + + +def test_incomplete_subclass_cannot_be_instantiated(): + from abc import ABC + class Incomplete(BaseDatasetSource, ABC): + pass + with pytest.raises(TypeError): + Incomplete() From 7d59f87f250b020479af324e0e16dab75a56ca1b Mon Sep 17 00:00:00 2001 From: Irozuku Date: Tue, 5 May 2026 12:50:41 -0400 Subject: [PATCH 019/361] feat: add HuggingFaceDatasetSource --- .../huggingface_dataset_source.py | 161 ++++++++++++++++++ .../test_base_dataset_source.py | 70 ++++++++ 2 files changed, 231 insertions(+) create mode 100644 DashAI/back/dataset_sources/huggingface_dataset_source.py diff --git a/DashAI/back/dataset_sources/huggingface_dataset_source.py b/DashAI/back/dataset_sources/huggingface_dataset_source.py new file mode 100644 index 000000000..256d38deb --- /dev/null +++ b/DashAI/back/dataset_sources/huggingface_dataset_source.py @@ -0,0 +1,161 @@ +"""HuggingFace Hub dataset source for DashAI.""" + +import logging +import os +from typing import Any, Final + +import httpx + +from DashAI.back.core.utils import MultilingualString +from DashAI.back.dataset_sources.base_dataset_source import BaseDatasetSource, DatasetEntry + +log = logging.getLogger(__name__) + +_HF_API = "https://huggingface.co/api/datasets" +_HF_ROWS_API = "https://datasets-server.huggingface.co/first-rows" + + +class 
HuggingFaceDatasetSource(BaseDatasetSource): + """Dataset source that fetches public datasets from HuggingFace Hub. + + Uses the HuggingFace public REST API — no authentication required for + public datasets. + """ + + DISPLAY_NAME: Final = MultilingualString( + en="HuggingFace Hub", + es="HuggingFace Hub", + ) + DESCRIPTION: Final = MultilingualString( + en="Browse and import public datasets from HuggingFace Hub.", + es="Navega e importa datasets públicos desde HuggingFace Hub.", + ) + + def search(self, query: str, limit: int = 20, **filters: Any) -> list[DatasetEntry]: + """Return public HuggingFace datasets matching a query. + + Parameters + ---------- + query : str + Search string passed to the HuggingFace datasets API. + limit : int, optional + Maximum number of results, by default 20. + **filters : Any + Unused; reserved for future tag/task filters. + + Returns + ------- + list[DatasetEntry] + Matching datasets. Returns empty list on API error. + """ + try: + resp = httpx.get( + _HF_API, + params={"search": query, "limit": limit, "full": "True"}, + timeout=15, + ) + if resp.status_code != 200: + log.warning("HuggingFace API returned %s", resp.status_code) + return [] + + entries = [] + for item in resp.json(): + entries.append( + DatasetEntry( + id=item.get("id", ""), + name=item.get("id", "").split("/")[-1], + description=item.get("description") or "", + tags=item.get("tags", []), + size_bytes=None, + row_count=None, + url=f"https://huggingface.co/datasets/{item.get('id', '')}", + source=self.__class__.__name__, + ) + ) + return entries + except Exception: + log.exception("Error searching HuggingFace datasets") + return [] + + def fetch_preview(self, dataset_id: str, n_rows: int = 100) -> "pd.DataFrame": + """Fetch sample rows using the HuggingFace datasets-server API. + + Parameters + ---------- + dataset_id : str + HuggingFace dataset identifier (e.g. ``"stanfordnlp/imdb"``). + n_rows : int, optional + Number of rows to fetch, by default 100. 
+ + Returns + ------- + pd.DataFrame + Sample rows. Returns empty DataFrame on error. + """ + import pandas as pd + + try: + resp = httpx.get( + _HF_ROWS_API, + params={ + "dataset": dataset_id, + "config": "default", + "split": "train", + "offset": 0, + "length": min(n_rows, 100), + }, + timeout=30, + ) + if resp.status_code != 200: + log.warning( + "HuggingFace rows API returned %s for %s", resp.status_code, dataset_id + ) + return pd.DataFrame() + + data = resp.json() + rows = [r["row"] for r in data.get("rows", [])] + return pd.DataFrame(rows) + except Exception: + log.exception("Error fetching HuggingFace preview for %s", dataset_id) + return pd.DataFrame() + + def fetch_full(self, dataset_id: str, temp_path: str) -> tuple[str, str]: + """Download the full dataset using the HuggingFace datasets library. + + Parameters + ---------- + dataset_id : str + HuggingFace dataset identifier (e.g. ``"stanfordnlp/imdb"``). + temp_path : str + Local directory to download into. + + Returns + ------- + tuple[str, str] + ``(csv_file_path, "CSVDataLoader")`` — path to the exported CSV + and the name of the DataLoader to use. + """ + from datasets import load_dataset as hf_load + + dataset = hf_load(dataset_id, cache_dir=temp_path, trust_remote_code=False) + split = "train" if "train" in dataset else list(dataset.keys())[0] + df = dataset[split].to_pandas() + + out_path = os.path.join(temp_path, f"{dataset_id.replace('/', '_')}.csv") + df.to_csv(out_path, index=False) + return (out_path, "CSVDataLoader") + + def get_download_url(self, dataset_id: str) -> str: + """Return the HuggingFace Hub page URL for the dataset. + + Parameters + ---------- + dataset_id : str + HuggingFace dataset identifier. + + Returns + ------- + str + URL to the dataset page on huggingface.co. 
+ """ + return f"https://huggingface.co/datasets/{dataset_id}" diff --git a/tests/back/dataset_sources/test_base_dataset_source.py b/tests/back/dataset_sources/test_base_dataset_source.py index 87df21dd0..7c46c535e 100644 --- a/tests/back/dataset_sources/test_base_dataset_source.py +++ b/tests/back/dataset_sources/test_base_dataset_source.py @@ -103,3 +103,73 @@ class Incomplete(BaseDatasetSource, ABC): pass with pytest.raises(TypeError): Incomplete() + + +from unittest.mock import patch, MagicMock +from DashAI.back.dataset_sources.huggingface_dataset_source import HuggingFaceDatasetSource + + +def test_hf_source_has_correct_type(): + assert HuggingFaceDatasetSource.TYPE == "DatasetSource" + + +def test_hf_search_returns_dataset_entries(): + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = [ + { + "id": "stanfordnlp/imdb", + "description": "IMDB movie review sentiment", + "tags": ["text-classification"], + "cardData": {"size_categories": ["10K Date: Tue, 5 May 2026 12:52:32 -0400 Subject: [PATCH 020/361] feat: add OpenMLDatasetSource --- .../dataset_sources/openml_dataset_source.py | 201 ++++++++++++++++++ .../test_base_dataset_source.py | 82 +++++++ 2 files changed, 283 insertions(+) create mode 100644 DashAI/back/dataset_sources/openml_dataset_source.py diff --git a/DashAI/back/dataset_sources/openml_dataset_source.py b/DashAI/back/dataset_sources/openml_dataset_source.py new file mode 100644 index 000000000..cd27801ae --- /dev/null +++ b/DashAI/back/dataset_sources/openml_dataset_source.py @@ -0,0 +1,201 @@ +"""OpenML dataset source for DashAI.""" + +import io +import logging +import os +from typing import Any, Final + +import httpx + +from DashAI.back.core.utils import MultilingualString +from DashAI.back.dataset_sources.base_dataset_source import BaseDatasetSource, DatasetEntry + +log = logging.getLogger(__name__) + +_OPENML_API = "https://www.openml.org/api/v1/json" +_OPENML_DATA = 
"https://data.openml.org/data/v1/download" + + +def _parse_quality(qualities: list[dict], name: str) -> int | None: + """Extract a numeric quality value by name from an OpenML quality list. + + Parameters + ---------- + qualities : list[dict] + List of ``{"name": str, "value": str}`` dicts from OpenML API. + name : str + Quality name to look up (e.g. ``"NumberOfInstances"``). + + Returns + ------- + int or None + Parsed integer value, or None if not found or not numeric. + """ + for q in qualities: + if q.get("name") == name: + try: + return int(float(q["value"])) + except (ValueError, KeyError): + return None + return None + + +class OpenMLDatasetSource(BaseDatasetSource): + """Dataset source that fetches public datasets from OpenML. + + Uses the OpenML public REST API — no authentication required. + """ + + DISPLAY_NAME: Final = MultilingualString( + en="OpenML", + es="OpenML", + ) + DESCRIPTION: Final = MultilingualString( + en="Browse and import public datasets from OpenML.", + es="Navega e importa datasets públicos desde OpenML.", + ) + + def search(self, query: str, limit: int = 20, **filters: Any) -> list[DatasetEntry]: + """Return active OpenML datasets matching a name query. + + Parameters + ---------- + query : str + Dataset name search string. + limit : int, optional + Maximum number of results, by default 20. + **filters : Any + Unused; reserved for future filters. + + Returns + ------- + list[DatasetEntry] + Matching datasets. Returns empty list on API error. 
+ """ + try: + resp = httpx.get( + f"{_OPENML_API}/data/list", + params={"data_name": query, "limit": limit, "status": "active"}, + timeout=15, + ) + if resp.status_code != 200: + log.warning("OpenML API returned %s", resp.status_code) + return [] + + items = resp.json().get("data", {}).get("dataset", []) + entries = [] + for item in items: + did = str(item.get("did", "")) + qualities = item.get("quality", []) + tag_raw = item.get("tag", []) + tags = [tag_raw] if isinstance(tag_raw, str) else tag_raw + entries.append( + DatasetEntry( + id=did, + name=item.get("name", ""), + description=item.get("description") or "", + tags=tags, + size_bytes=None, + row_count=_parse_quality(qualities, "NumberOfInstances"), + url=f"https://www.openml.org/d/{did}", + source=self.__class__.__name__, + ) + ) + return entries + except Exception: + log.exception("Error searching OpenML datasets") + return [] + + def fetch_preview(self, dataset_id: str, n_rows: int = 100) -> "pd.DataFrame": + """Download and parse sample rows from an OpenML dataset ARFF file. + + Parameters + ---------- + dataset_id : str + OpenML dataset ID (integer as string, e.g. ``"61"``). + n_rows : int, optional + Maximum rows to return, by default 100. + + Returns + ------- + pd.DataFrame + Sample rows. Returns empty DataFrame on error. 
+ """ + import pandas as pd + from scipy.io import arff as scipy_arff + + try: + info_resp = httpx.get( + f"{_OPENML_API}/data/{dataset_id}", + timeout=15, + ) + if info_resp.status_code != 200: + return pd.DataFrame() + + file_id = info_resp.json()["data_set_description"]["file_id"] + file_resp = httpx.get( + f"{_OPENML_DATA}/{file_id}", + timeout=60, + ) + if file_resp.status_code != 200: + return pd.DataFrame() + + arff_text = file_resp.content.decode("utf-8", errors="replace") + data, meta = scipy_arff.loadarff(io.StringIO(arff_text)) + df = pd.DataFrame(data) + for col in df.select_dtypes(include=["object"]).columns: + df[col] = df[col].str.decode("utf-8", errors="replace") + return df.head(n_rows) + except Exception: + log.exception("Error fetching OpenML preview for dataset %s", dataset_id) + return pd.DataFrame() + + def fetch_full(self, dataset_id: str, temp_path: str) -> tuple[str, str]: + """Download the full OpenML dataset as CSV. + + Parameters + ---------- + dataset_id : str + OpenML dataset ID (integer as string, e.g. ``"61"``). + temp_path : str + Local directory to download into. + + Returns + ------- + tuple[str, str] + ``(csv_file_path, "CSVDataLoader")``. 
+ """ + import pandas as pd + from scipy.io import arff as scipy_arff + + info_resp = httpx.get(f"{_OPENML_API}/data/{dataset_id}", timeout=15) + info_resp.raise_for_status() + file_id = info_resp.json()["data_set_description"]["file_id"] + + file_resp = httpx.get(f"{_OPENML_DATA}/{file_id}", timeout=120) + file_resp.raise_for_status() + + arff_text = file_resp.content.decode("utf-8", errors="replace") + data, _ = scipy_arff.loadarff(io.StringIO(arff_text)) + df = pd.DataFrame(data) + for col in df.select_dtypes(include=["object"]).columns: + df[col] = df[col].str.decode("utf-8", errors="replace") + + out_path = os.path.join(temp_path, f"openml_{dataset_id}.csv") + df.to_csv(out_path, index=False) + return (out_path, "CSVDataLoader") + + def get_download_url(self, dataset_id: str) -> str: + """Return the OpenML dataset page URL. + + Parameters + ---------- + dataset_id : str + OpenML dataset ID. + + Returns + ------- + str + URL to the dataset page on openml.org. + """ + return f"https://www.openml.org/d/{dataset_id}" diff --git a/tests/back/dataset_sources/test_base_dataset_source.py b/tests/back/dataset_sources/test_base_dataset_source.py index 7c46c535e..63c03eb52 100644 --- a/tests/back/dataset_sources/test_base_dataset_source.py +++ b/tests/back/dataset_sources/test_base_dataset_source.py @@ -173,3 +173,85 @@ def test_hf_fetch_preview_returns_dataframe(): assert isinstance(df, pd.DataFrame) assert list(df.columns) == ["text", "label"] assert len(df) == 2 + + +from DashAI.back.dataset_sources.openml_dataset_source import OpenMLDatasetSource + + +def test_openml_source_has_correct_type(): + assert OpenMLDatasetSource.TYPE == "DatasetSource" + + +def test_openml_search_returns_dataset_entries(): + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "data": { + "dataset": [ + { + "did": 61, + "name": "iris", + "description": "The Iris dataset", + "tag": ["study_14", "uci"], + "file_id": 22044555, + "quality": 
[{"name": "NumberOfInstances", "value": "150"}], + } + ] + } + } + + with patch("httpx.get", return_value=mock_response): + source = OpenMLDatasetSource() + results = source.search("iris", limit=5) + + assert len(results) == 1 + assert results[0].id == "61" + assert results[0].name == "iris" + assert results[0].row_count == 150 + assert results[0].source == "OpenMLDatasetSource" + assert results[0].url == "https://www.openml.org/d/61" + + +def test_openml_search_handles_http_error(): + mock_response = MagicMock() + mock_response.status_code = 500 + + with patch("httpx.get", return_value=mock_response): + source = OpenMLDatasetSource() + results = source.search("iris") + + assert results == [] + + +def test_openml_get_download_url(): + source = OpenMLDatasetSource() + url = source.get_download_url("61") + assert url == "https://www.openml.org/d/61" + + +def test_openml_fetch_preview_returns_dataframe(): + import pandas as pd + + info_response = MagicMock() + info_response.status_code = 200 + info_response.json.return_value = { + "data_set_description": {"file_id": "22044555"} + } + + arff_content = b"""@relation iris +@attribute sepalLength numeric +@attribute class {Iris-setosa,Iris-versicolor} +@data +5.1,Iris-setosa +4.9,Iris-versicolor +""" + file_response = MagicMock() + file_response.status_code = 200 + file_response.content = arff_content + + with patch("httpx.get", side_effect=[info_response, file_response]): + source = OpenMLDatasetSource() + df = source.fetch_preview("61", n_rows=2) + + assert isinstance(df, pd.DataFrame) + assert "sepalLength" in df.columns From dc5c750f14c9d0bc99043b30ba9eddcb62fc58b4 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Tue, 5 May 2026 12:56:51 -0400 Subject: [PATCH 021/361] feat: add dataset source API endpoints and register sources --- DashAI/back/api/api_v1/api.py | 2 + .../api/api_v1/endpoints/dataset_source.py | 191 ++++++++++++++++++ DashAI/back/initial_components.py | 7 + tests/back/api/test_dataset_source_api.py | 101 
+++++++++ 4 files changed, 301 insertions(+) create mode 100644 DashAI/back/api/api_v1/endpoints/dataset_source.py create mode 100644 tests/back/api/test_dataset_source_api.py diff --git a/DashAI/back/api/api_v1/api.py b/DashAI/back/api/api_v1/api.py index 6a3476db2..897fe5b7e 100644 --- a/DashAI/back/api/api_v1/api.py +++ b/DashAI/back/api/api_v1/api.py @@ -20,6 +20,7 @@ from DashAI.back.api.api_v1.endpoints.plugins import router as plugins from DashAI.back.api.api_v1.endpoints.predict import router as predict from DashAI.back.api.api_v1.endpoints.runs import router as runs +from DashAI.back.api.api_v1.endpoints.dataset_source import router as dataset_source from DashAI.back.api.api_v1.endpoints.scoring import router as scoring api_router_v1 = APIRouter() @@ -40,3 +41,4 @@ api_router_v1.include_router(metrics, prefix="/metrics") api_router_v1.include_router(hardware, prefix="/hardware") api_router_v1.include_router(scoring, prefix="/scoring") +api_router_v1.include_router(dataset_source, prefix="/dataset-source") diff --git a/DashAI/back/api/api_v1/endpoints/dataset_source.py b/DashAI/back/api/api_v1/endpoints/dataset_source.py new file mode 100644 index 000000000..96dd603d2 --- /dev/null +++ b/DashAI/back/api/api_v1/endpoints/dataset_source.py @@ -0,0 +1,191 @@ +"""Dataset source API endpoints.""" + +import logging +from typing import TYPE_CHECKING, Any, Dict, List +from urllib.parse import unquote + +from fastapi import APIRouter, Depends, Query, status +from fastapi.exceptions import HTTPException +from kink import di + +from DashAI.back.types.inf.type_inference import infer_types + +if TYPE_CHECKING: + from DashAI.back.dependencies.registry import ComponentRegistry + +log = logging.getLogger(__name__) +router = APIRouter() + + +def _get_source(source_name: str, registry: "ComponentRegistry"): + """Retrieve and instantiate a DatasetSource from the registry. + + Parameters + ---------- + source_name : str + Registered class name of the DatasetSource. 
+ registry : ComponentRegistry + The component registry to look up. + + Returns + ------- + BaseDatasetSource + Instantiated source object. + + Raises + ------ + HTTPException + 404 if source_name is not found in the DatasetSource registry. + """ + sources = registry._registry.get("DatasetSource", {}) + if source_name not in sources: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"DatasetSource '{source_name}' not found.", + ) + return sources[source_name]["class"]() + + +@router.get("/", response_model=List[Dict[str, Any]]) +async def list_sources( + registry: "ComponentRegistry" = Depends(lambda: di["component_registry"]), +) -> List[Dict[str, Any]]: + """Return all registered DatasetSource components. + + Parameters + ---------- + registry : ComponentRegistry + Injected component registry. + + Returns + ------- + list[dict] + List of source metadata dicts with name, type, display_name, description. + """ + sources = registry._registry.get("DatasetSource", {}) + return [ + { + "name": name, + "type": "DatasetSource", + "display_name": str(getattr(info["class"], "DISPLAY_NAME", name)), + "description": str(getattr(info["class"], "DESCRIPTION", "")), + } + for name, info in sources.items() + ] + + +@router.get("/{source_name}/search") +async def search_datasets( + source_name: str, + q: str = Query(default="", description="Search query"), + limit: int = Query(default=20, ge=1, le=100), + registry: "ComponentRegistry" = Depends(lambda: di["component_registry"]), +) -> List[Dict[str, Any]]: + """Search for datasets in a registered source. + + Parameters + ---------- + source_name : str + Registered DatasetSource class name. + q : str + Search query string. + limit : int + Maximum number of results (1-100). + registry : ComponentRegistry + Injected component registry. + + Returns + ------- + list[dict] + List of DatasetEntry dicts. 
+ """ + source = _get_source(source_name, registry) + results = source.search(q, limit=limit) + return [ + { + "id": e.id, + "name": e.name, + "description": e.description, + "tags": e.tags, + "size_bytes": e.size_bytes, + "row_count": e.row_count, + "url": e.url, + "source": e.source, + } + for e in results + ] + + +@router.get("/{source_name}/{dataset_id:path}/download-url") +async def get_download_url( + source_name: str, + dataset_id: str, + registry: "ComponentRegistry" = Depends(lambda: di["component_registry"]), +) -> Dict[str, str]: + """Return the direct download URL for a dataset. + + Parameters + ---------- + source_name : str + Registered DatasetSource class name. + dataset_id : str + Source-specific dataset identifier (URL-encoded). + registry : ComponentRegistry + Injected component registry. + + Returns + ------- + dict + ``{"url": ""}``. + """ + source = _get_source(source_name, registry) + url = source.get_download_url(unquote(dataset_id)) + return {"url": url} + + +@router.get("/{source_name}/{dataset_id:path}/preview") +async def preview_dataset( + source_name: str, + dataset_id: str, + n_rows: int = Query(default=100, ge=1, le=500), + registry: "ComponentRegistry" = Depends(lambda: di["component_registry"]), +) -> Dict[str, Any]: + """Fetch a sample preview of a dataset with inferred DashAI column types. + + Parameters + ---------- + source_name : str + Registered DatasetSource class name. + dataset_id : str + Source-specific dataset identifier (URL-encoded). + n_rows : int + Number of sample rows (1-500). + registry : ComponentRegistry + Injected component registry. + + Returns + ------- + dict + ``{"sample": [...], "inferred_types": {...}, "preview_row_count": int}`` + matching the format expected by the PreviewDataset frontend component. 
+ """ + source = _get_source(source_name, registry) + decoded_id = unquote(dataset_id) + + try: + df = source.fetch_preview(decoded_id, n_rows=n_rows) + except Exception as exc: + log.exception("Error fetching preview for %s/%s", source_name, decoded_id) + raise HTTPException( + status_code=status.HTTP_502_BAD_GATEWAY, + detail=f"Failed to fetch preview from source: {exc}", + ) from exc + + inferred = infer_types(df, method="DashAIPtype") + sample = df.to_dict(orient="records") + + return { + "sample": sample, + "inferred_types": inferred, + "preview_row_count": len(df), + } diff --git a/DashAI/back/initial_components.py b/DashAI/back/initial_components.py index 0a01fb093..1546e8586 100644 --- a/DashAI/back/initial_components.py +++ b/DashAI/back/initial_components.py @@ -66,6 +66,10 @@ from DashAI.back.dataloaders.classes.excel_dataloader import ExcelDataLoader from DashAI.back.dataloaders.classes.json_dataloader import JSONDataLoader +# Dataset Sources +from DashAI.back.dataset_sources.huggingface_dataset_source import HuggingFaceDatasetSource +from DashAI.back.dataset_sources.openml_dataset_source import OpenMLDatasetSource + # Explainers from DashAI.back.explainability.explainers.kernel_shap import KernelShap from DashAI.back.explainability.explainers.partial_dependence import PartialDependence @@ -280,6 +284,9 @@ def get_initial_components(): CSVDataLoader, JSONDataLoader, ExcelDataLoader, + # Dataset Sources + HuggingFaceDatasetSource, + OpenMLDatasetSource, # Metrics F1, Accuracy, diff --git a/tests/back/api/test_dataset_source_api.py b/tests/back/api/test_dataset_source_api.py new file mode 100644 index 000000000..224cea04a --- /dev/null +++ b/tests/back/api/test_dataset_source_api.py @@ -0,0 +1,101 @@ +"""Tests for the dataset_source API endpoints.""" +import pytest +from fastapi.testclient import TestClient + +from DashAI.back.dataset_sources.base_dataset_source import BaseDatasetSource, DatasetEntry +from DashAI.back.dependencies.registry import 
ComponentRegistry + + +class MockDatasetSource(BaseDatasetSource): + DISPLAY_NAME = "Mock Source" + DESCRIPTION = "Mock for testing" + + def search(self, query, limit=20, **filters): + if query == "error": + return [] + return [ + DatasetEntry( + id="mock/dataset", + name="Mock Dataset", + description="A mock dataset", + tags=["tabular"], + size_bytes=1024, + row_count=100, + url="https://mock.example.com/mock-dataset", + source="MockDatasetSource", + ) + ] + + def fetch_preview(self, dataset_id, n_rows=100): + import pandas as pd + return pd.DataFrame({"col_a": [1, 2, 3], "col_b": ["x", "y", "z"]}) + + def fetch_full(self, dataset_id, temp_path): + return ("/tmp/mock.csv", "CSVDataLoader") + + def get_download_url(self, dataset_id): + return f"https://mock.example.com/{dataset_id}" + + @classmethod + def get_schema(cls): + return {} + + +@pytest.fixture(autouse=True, name="test_registry_hub") +def setup_test_registry(client, monkeypatch): + container = client.app.container + test_registry = ComponentRegistry(initial_components=[MockDatasetSource]) + monkeypatch.setitem(container._services, "component_registry", test_registry) + return test_registry + + +def test_list_sources(client: TestClient): + response = client.get("/api/v1/dataset-source/") + assert response.status_code == 200 + data = response.json() + assert len(data) == 1 + assert data[0]["name"] == "MockDatasetSource" + assert data[0]["type"] == "DatasetSource" + + +def test_search_returns_entries(client: TestClient): + response = client.get("/api/v1/dataset-source/MockDatasetSource/search?q=test&limit=5") + assert response.status_code == 200 + data = response.json() + assert len(data) == 1 + assert data[0]["id"] == "mock/dataset" + assert data[0]["name"] == "Mock Dataset" + + +def test_search_empty_result(client: TestClient): + response = client.get("/api/v1/dataset-source/MockDatasetSource/search?q=error") + assert response.status_code == 200 + assert response.json() == [] + + +def 
test_search_unknown_source(client: TestClient): + response = client.get("/api/v1/dataset-source/UnknownSource/search?q=test") + assert response.status_code == 404 + + +def test_get_download_url(client: TestClient): + response = client.get("/api/v1/dataset-source/MockDatasetSource/mock%2Fdataset/download-url") + assert response.status_code == 200 + assert response.json() == {"url": "https://mock.example.com/mock/dataset"} + + +def test_get_download_url_unknown_source(client: TestClient): + response = client.get("/api/v1/dataset-source/Unknown/some-id/download-url") + assert response.status_code == 404 + + +def test_get_preview(client: TestClient): + response = client.get( + "/api/v1/dataset-source/MockDatasetSource/mock%2Fdataset/preview?n_rows=3" + ) + assert response.status_code == 200 + data = response.json() + assert "sample" in data + assert "inferred_types" in data + assert "preview_row_count" in data + assert len(data["sample"]) == 3 From 17cafed40950e9898103c17acc08fbfa47936b1e Mon Sep 17 00:00:00 2001 From: Irozuku Date: Tue, 5 May 2026 15:26:57 -0400 Subject: [PATCH 022/361] feat: add DatasetJob source branch for Hub imports --- .../api/api_v1/endpoints/dataset_source.py | 64 +++++++++++++++++++ DashAI/back/job/dataset_job.py | 59 +++++++++++++---- tests/back/api/test_dataset_source_api.py | 62 +++++++++++++++++- 3 files changed, 172 insertions(+), 13 deletions(-) diff --git a/DashAI/back/api/api_v1/endpoints/dataset_source.py b/DashAI/back/api/api_v1/endpoints/dataset_source.py index 96dd603d2..811e20fdb 100644 --- a/DashAI/back/api/api_v1/endpoints/dataset_source.py +++ b/DashAI/back/api/api_v1/endpoints/dataset_source.py @@ -7,6 +7,7 @@ from fastapi import APIRouter, Depends, Query, status from fastapi.exceptions import HTTPException from kink import di +from pydantic import BaseModel from DashAI.back.types.inf.type_inference import infer_types @@ -189,3 +190,66 @@ async def preview_dataset( "inferred_types": inferred, "preview_row_count": len(df), } + 
+ +class ImportRequest(BaseModel): + """Request body for the dataset import endpoint. + + Parameters + ---------- + dataset_id : int + ID of a pre-created Dataset DB record to populate. + params : dict + Parameters including ``inferred_types`` and ``column_renames``. + """ + + dataset_id: int + params: Dict[str, Any] = {} + + +@router.post("/{source_name}/{dataset_id:path}/import", status_code=status.HTTP_201_CREATED) +async def import_dataset( + source_name: str, + dataset_id: str, + body: ImportRequest, + registry: "ComponentRegistry" = Depends(lambda: di["component_registry"]), + job_queue=Depends(lambda: di["job_queue"]), +) -> Dict[str, Any]: + """Enqueue a DatasetJob to import a dataset from an external source. + + Parameters + ---------- + source_name : str + Registered DatasetSource class name. + dataset_id : str + Source-specific dataset identifier (URL-encoded). + body : ImportRequest + Contains the DashAI dataset_id and params. + registry : ComponentRegistry + Injected component registry. + job_queue : BaseJobQueue + Injected job queue. + + Returns + ------- + dict + ``{"job_id": int, "dataset_id": int}`` — the enqueued job and dataset IDs. 
+ """ + from DashAI.back.job.dataset_job import DatasetJob + + _get_source(source_name, registry) # validates source exists, raises 404 if not + + job = DatasetJob( + kwargs={ + "dataset_id": body.dataset_id, + "source_name": source_name, + "dataset_source_id": unquote(dataset_id), + "params": body.params, + } + ) + job.set_status_as_delivered() + result = job_queue.put(job) + # huey.api.Result has .id (task UUID string); plain int in other modes + job_id = getattr(result, "id", result) + + return {"job_id": job_id, "dataset_id": body.dataset_id} diff --git a/DashAI/back/job/dataset_job.py b/DashAI/back/job/dataset_job.py index 2ab1ceb98..f948b95a7 100644 --- a/DashAI/back/job/dataset_job.py +++ b/DashAI/back/job/dataset_job.py @@ -162,17 +162,54 @@ def run( ) else: - parsed_params = parse_params(DatasetParams, json.dumps(params)) - dataloader = component_registry[parsed_params.dataloader]["class"]() - log.debug("Storing dataset in %s", folder_path) - new_dataset = dataloader.load_data( - filepath_or_buffer=( - str(file_path) if file_path is not None else url - ), - temp_path=str(temp_dir), - params=parsed_params.model_dump(), - n_sample=n_sample, - ) + source_name = self.kwargs.get("source_name") + + if source_name: + # --- Hub import path --- + import tempfile + hub_temp = tempfile.mkdtemp() + temp_dir = hub_temp # ensures finally block cleans it up + + dataset_source_id = self.kwargs.get("dataset_source_id", "") + sources = component_registry._registry.get("DatasetSource", {}) + if source_name not in sources: + raise JobError( + f"DatasetSource '{source_name}' not found in registry." + ) + source = sources[source_name]["class"]() + file_path_hub, dataloader_name = source.fetch_full( + dataset_source_id, hub_temp + ) + dl_registry = component_registry._registry.get("DataLoader", {}) + if dataloader_name not in dl_registry: + raise JobError( + f"DataLoader '{dataloader_name}' not found in registry." 
+ ) + dataloader = dl_registry[dataloader_name]["class"]() + log.debug( + "Loading hub dataset from %s using %s", + file_path_hub, + dataloader_name, + ) + new_dataset = dataloader.load_data( + filepath_or_buffer=file_path_hub, + temp_path=hub_temp, + params={}, + n_sample=None, + ) + else: + # --- File / URL upload path (unchanged) --- + parsed_params = parse_params(DatasetParams, json.dumps(params)) + dataloader = component_registry[parsed_params.dataloader]["class"]() + log.debug("Storing dataset in %s", folder_path) + new_dataset = dataloader.load_data( + filepath_or_buffer=( + str(file_path) if file_path is not None else url + ), + temp_path=str(temp_dir), + params=parsed_params.model_dump(), + n_sample=n_sample, + ) if "inferred_types" in params: schema = params["inferred_types"] diff --git a/tests/back/api/test_dataset_source_api.py b/tests/back/api/test_dataset_source_api.py index 224cea04a..b5193a5e5 100644 --- a/tests/back/api/test_dataset_source_api.py +++ b/tests/back/api/test_dataset_source_api.py @@ -1,11 +1,39 @@ """Tests for the dataset_source API endpoints.""" +import os +from typing import Any, Dict + import pytest from fastapi.testclient import TestClient from DashAI.back.dataset_sources.base_dataset_source import BaseDatasetSource, DatasetEntry +from DashAI.back.dataloaders.classes.dataloader import BaseDataLoader from DashAI.back.dependencies.registry import ComponentRegistry +class MockDataLoader(BaseDataLoader): + """Minimal DataLoader for testing the hub import path.""" + + name = "MockDataLoader" + + @classmethod + def get_schema(cls): + return {} + + def load_data( + self, + filepath_or_buffer: str, + temp_path: str, + params: Dict[str, Any], + n_sample=None, + ): + import pandas as pd + + from DashAI.back.dataloaders.classes.dashai_dataset import to_dashai_dataset + + df = pd.read_csv(filepath_or_buffer) + return to_dashai_dataset(df) + + class MockDatasetSource(BaseDatasetSource): DISPLAY_NAME = "Mock Source" DESCRIPTION = "Mock for 
testing" @@ -31,7 +59,10 @@ def fetch_preview(self, dataset_id, n_rows=100): return pd.DataFrame({"col_a": [1, 2, 3], "col_b": ["x", "y", "z"]}) def fetch_full(self, dataset_id, temp_path): - return ("/tmp/mock.csv", "CSVDataLoader") + csv_path = os.path.join(temp_path, "mock.csv") + with open(csv_path, "w") as f: + f.write("col_a,col_b\n1,x\n2,y\n3,z\n") + return (csv_path, "MockDataLoader") def get_download_url(self, dataset_id): return f"https://mock.example.com/{dataset_id}" @@ -44,7 +75,7 @@ def get_schema(cls): @pytest.fixture(autouse=True, name="test_registry_hub") def setup_test_registry(client, monkeypatch): container = client.app.container - test_registry = ComponentRegistry(initial_components=[MockDatasetSource]) + test_registry = ComponentRegistry(initial_components=[MockDatasetSource, MockDataLoader]) monkeypatch.setitem(container._services, "component_registry", test_registry) return test_registry @@ -99,3 +130,30 @@ def test_get_preview(client: TestClient): assert "inferred_types" in data assert "preview_row_count" in data assert len(data["sample"]) == 3 + + +def test_import_endpoint_creates_dataset_and_job(client: TestClient): + """POST import creates a Dataset record and enqueues a DatasetJob.""" + create_resp = client.post("/api/v1/dataset/", json={"name": "hub_import_test"}) + assert create_resp.status_code == 201 + dataset_id = create_resp.json()["id"] + + response = client.post( + "/api/v1/dataset-source/MockDatasetSource/mock%2Fdataset/import", + json={ + "dataset_id": dataset_id, + "params": {}, + }, + ) + assert response.status_code == 201 + data = response.json() + assert "job_id" in data + assert data["dataset_id"] == dataset_id + + +def test_import_endpoint_unknown_source(client: TestClient): + response = client.post( + "/api/v1/dataset-source/Unknown/some%2Fdataset/import", + json={"dataset_id": 999, "params": {}}, + ) + assert response.status_code == 404 From 9d9236d6b222dbc3b60038d467bb63bf4f158521 Mon Sep 17 00:00:00 2001 From: 
Irozuku Date: Tue, 5 May 2026 15:28:50 -0400 Subject: [PATCH 023/361] feat: add Hub API client and i18n strings --- DashAI/front/src/api/hub.ts | 82 +++++++++++++++++++ DashAI/front/src/utils/i18n/index.js | 5 ++ .../src/utils/i18n/locales/en/common.json | 1 + .../front/src/utils/i18n/locales/en/hub.json | 22 +++++ .../src/utils/i18n/locales/es/common.json | 1 + .../front/src/utils/i18n/locales/es/hub.json | 22 +++++ 6 files changed, 133 insertions(+) create mode 100644 DashAI/front/src/api/hub.ts create mode 100644 DashAI/front/src/utils/i18n/locales/en/hub.json create mode 100644 DashAI/front/src/utils/i18n/locales/es/hub.json diff --git a/DashAI/front/src/api/hub.ts b/DashAI/front/src/api/hub.ts new file mode 100644 index 000000000..40c108836 --- /dev/null +++ b/DashAI/front/src/api/hub.ts @@ -0,0 +1,82 @@ +import api from "./api"; + +const hubEndpoint = "/v1/dataset-source"; + +export interface DatasetSourceInfo { + name: string; + type: string; + display_name: string; + description: string; +} + +export interface DatasetEntry { + id: string; + name: string; + description: string; + tags: string[]; + size_bytes: number | null; + row_count: number | null; + url: string; + source: string; +} + +export interface DatasetPreview { + sample: Record[]; + inferred_types: Record; + preview_row_count: number; +} + +export const getDatasetSources = async (): Promise => { + const response = await api.get(`${hubEndpoint}/`); + return response.data; +}; + +export const searchDatasets = async ( + sourceName: string, + query: string, + limit = 20, +): Promise => { + const response = await api.get( + `${hubEndpoint}/${sourceName}/search`, + { params: { q: query, limit } }, + ); + return response.data; +}; + +export const getDownloadUrl = async ( + sourceName: string, + datasetId: string, +): Promise => { + const encodedId = encodeURIComponent(datasetId); + const response = await api.get<{ url: string }>( + `${hubEndpoint}/${sourceName}/${encodedId}/download-url`, + ); + return 
response.data.url; +}; + +export const previewHubDataset = async ( + sourceName: string, + datasetId: string, + nRows = 100, +): Promise => { + const encodedId = encodeURIComponent(datasetId); + const response = await api.get( + `${hubEndpoint}/${sourceName}/${encodedId}/preview`, + { params: { n_rows: nRows } }, + ); + return response.data; +}; + +export const importHubDataset = async ( + sourceName: string, + datasetId: string, + dashaiDatasetId: number, + params: Record, +): Promise<{ job_id: string; dataset_id: number }> => { + const encodedId = encodeURIComponent(datasetId); + const response = await api.post<{ job_id: string; dataset_id: number }>( + `${hubEndpoint}/${sourceName}/${encodedId}/import`, + { dataset_id: dashaiDatasetId, params }, + ); + return response.data; +}; diff --git a/DashAI/front/src/utils/i18n/index.js b/DashAI/front/src/utils/i18n/index.js index 69b660c74..448e8ef3e 100644 --- a/DashAI/front/src/utils/i18n/index.js +++ b/DashAI/front/src/utils/i18n/index.js @@ -35,6 +35,8 @@ import pluginsEN from "./locales/en/plugins.json"; import pluginsES from "./locales/es/plugins.json"; import generativeTourEN from "./locales/en/generativeTour.json"; import generativeTourES from "./locales/es/generativeTour.json"; +import hubEN from "./locales/en/hub.json"; +import hubES from "./locales/es/hub.json"; // the translations // (tip move them in a JSON file and import them, @@ -58,6 +60,7 @@ const resources = { modelsTour: modelsTourEN, modelsSessionTour: modelsSessionTourEN, generativeTour: generativeTourEN, + hub: hubEN, }, es: { configurableObject: configurableObjectES, @@ -77,6 +80,7 @@ const resources = { modelsTour: modelsTourES, modelsSessionTour: modelsSessionTourES, generativeTour: generativeTourES, + hub: hubES, }, }; @@ -106,6 +110,7 @@ i18n "modelsSessionTour", "plugins", "generativeTour", + "hub", ], defaultNS: "common", diff --git a/DashAI/front/src/utils/i18n/locales/en/common.json b/DashAI/front/src/utils/i18n/locales/en/common.json 
index 90e7e9af0..18edf3632 100644 --- a/DashAI/front/src/utils/i18n/locales/en/common.json +++ b/DashAI/front/src/utils/i18n/locales/en/common.json @@ -1,5 +1,6 @@ { "actions": "Actions", + "hub": "Hub", "add": "Add", "addModel": "Add Model", "addRow": "Add Row", diff --git a/DashAI/front/src/utils/i18n/locales/en/hub.json b/DashAI/front/src/utils/i18n/locales/en/hub.json new file mode 100644 index 000000000..bc591198b --- /dev/null +++ b/DashAI/front/src/utils/i18n/locales/en/hub.json @@ -0,0 +1,22 @@ +{ + "title": "Dataset Hub", + "searchPlaceholder": "Search datasets...", + "noSourceSelected": "Select a source from the left to browse datasets.", + "noResults": "No datasets found. Try a different search term.", + "loadingDatasets": "Loading datasets...", + "loadingSources": "Loading sources...", + "rows": "rows", + "tags": "Tags", + "source": "Source", + "size": "Size", + "viewOnSource": "View on source", + "addToDashAI": "Add to DashAI", + "download": "Download", + "importDataset": "Import Dataset", + "datasetName": "Dataset name", + "importing": "Importing...", + "importSuccess": "Dataset imported successfully.", + "importError": "Failed to import dataset.", + "previewError": "Failed to load dataset preview.", + "selectDatasetToPreview": "Select a dataset to see details." 
+} diff --git a/DashAI/front/src/utils/i18n/locales/es/common.json b/DashAI/front/src/utils/i18n/locales/es/common.json index d9a4f55af..eb86a9118 100644 --- a/DashAI/front/src/utils/i18n/locales/es/common.json +++ b/DashAI/front/src/utils/i18n/locales/es/common.json @@ -1,5 +1,6 @@ { "actions": "Acciones", + "hub": "Hub", "add": "Agregar", "addModel": "Agregar Modelo", "addRow": "Agregar Fila", diff --git a/DashAI/front/src/utils/i18n/locales/es/hub.json b/DashAI/front/src/utils/i18n/locales/es/hub.json new file mode 100644 index 000000000..3a51ee3c0 --- /dev/null +++ b/DashAI/front/src/utils/i18n/locales/es/hub.json @@ -0,0 +1,22 @@ +{ + "title": "Hub de Datasets", + "searchPlaceholder": "Buscar datasets...", + "noSourceSelected": "Selecciona una fuente a la izquierda para explorar datasets.", + "noResults": "No se encontraron datasets. Prueba otro término de búsqueda.", + "loadingDatasets": "Cargando datasets...", + "loadingSources": "Cargando fuentes...", + "rows": "filas", + "tags": "Etiquetas", + "source": "Fuente", + "size": "Tamaño", + "viewOnSource": "Ver en la fuente", + "addToDashAI": "Agregar a DashAI", + "download": "Descargar", + "importDataset": "Importar Dataset", + "datasetName": "Nombre del dataset", + "importing": "Importando...", + "importSuccess": "Dataset importado exitosamente.", + "importError": "Error al importar el dataset.", + "previewError": "Error al cargar la vista previa.", + "selectDatasetToPreview": "Selecciona un dataset para ver los detalles." 
+} From 081c1e5694a1005f2c206ae23a5b668e5c34b092 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Tue, 5 May 2026 15:31:30 -0400 Subject: [PATCH 024/361] feat: add initialData prop to PreviewDataset for Hub import flow --- .../datasetCreation/PreviewDataset.jsx | 89 ++++++++++++------- 1 file changed, 55 insertions(+), 34 deletions(-) diff --git a/DashAI/front/src/components/notebooks/datasetCreation/PreviewDataset.jsx b/DashAI/front/src/components/notebooks/datasetCreation/PreviewDataset.jsx index 70ade95cc..236c0ff8c 100644 --- a/DashAI/front/src/components/notebooks/datasetCreation/PreviewDataset.jsx +++ b/DashAI/front/src/components/notebooks/datasetCreation/PreviewDataset.jsx @@ -16,6 +16,7 @@ import { useTranslation } from "react-i18next"; * @param {function} onTypesChanged - Callback to notify parent when column types change * @param {function} onColumnRename - Callback to notify parent when columns are renamed (oldName, newName) * @param {function} onPreviewLoaded - Callback to notify parent when preview is loaded + * @param {object} initialData - Pre-fetched preview data; when provided, skips the previewWithTypes API call */ function PreviewDataset({ datasetData, @@ -24,6 +25,7 @@ function PreviewDataset({ onTypesChanged, onColumnRename, onPreviewLoaded, + initialData = null, }) { const theme = useTheme(); const { enqueueSnackbar } = useSnackbar(); @@ -51,6 +53,17 @@ function PreviewDataset({ }, [loading, error, onPreviewLoaded]); useEffect(() => { + if (initialData) { + setPreviewData(initialData); + setColumnTypes(initialData.inferred_types); + if (onTypesChangedRef.current) { + onTypesChangedRef.current(initialData.inferred_types); + } + setError(null); + setLoading(false); + return; + } + const loadPreview = async () => { if (!datasetData) { setError(t("datasets:error.noDatasetDataAvailable")); @@ -92,7 +105,7 @@ function PreviewDataset({ // Re-run preview when the file changes OR when params change. 
We stringify params to // create a stable dependency so changes to configuration in the right sidebar // trigger a new preview request. - }, [datasetData?.file, JSON.stringify(datasetData?.params || {})]); + }, [initialData, datasetData?.file, JSON.stringify(datasetData?.params || {})]); const handleTypeChange = useCallback( (typeChanges) => { @@ -232,48 +245,50 @@ function PreviewDataset({ width: "100%", }} > - - - {t("datasets:label.showingRowsInference", { - sampleLength: previewData.sample.length, - previewRowCount: previewData.preview_row_count, - })} -
- {t("datasets:label.changeColumnTypesInfo")} -
- - -
+ + {t("datasets:label.showingRowsInference", { + sampleLength: previewData.sample.length, + previewRowCount: previewData.preview_row_count, + })} +
+ {t("datasets:label.changeColumnTypesInfo")} +
+ + +
+ )} Date: Tue, 5 May 2026 15:33:00 -0400 Subject: [PATCH 025/361] feat: add HubLeftBar, DatasetCard, and DatasetGrid components --- .../front/src/components/hub/DatasetCard.jsx | 76 ++++++++++ .../front/src/components/hub/DatasetGrid.jsx | 134 ++++++++++++++++++ .../front/src/components/hub/HubLeftBar.jsx | 90 ++++++++++++ 3 files changed, 300 insertions(+) create mode 100644 DashAI/front/src/components/hub/DatasetCard.jsx create mode 100644 DashAI/front/src/components/hub/DatasetGrid.jsx create mode 100644 DashAI/front/src/components/hub/HubLeftBar.jsx diff --git a/DashAI/front/src/components/hub/DatasetCard.jsx b/DashAI/front/src/components/hub/DatasetCard.jsx new file mode 100644 index 000000000..3cacd5256 --- /dev/null +++ b/DashAI/front/src/components/hub/DatasetCard.jsx @@ -0,0 +1,76 @@ +import { Box, Chip, Paper, Stack, Typography } from "@mui/material"; +import StorageIcon from "@mui/icons-material/Storage"; +import { useTranslation } from "react-i18next"; + +/** + * Card displaying a single dataset from the Hub. + * + * @param {object} dataset - DatasetEntry object. + * @param {boolean} selected - Whether this card is currently selected. + * @param {function} onSelect - Called when the card is clicked. + */ +export default function DatasetCard({ dataset, selected, onSelect }) { + const { t } = useTranslation(["hub"]); + + const formatSize = (bytes) => { + if (!bytes) return null; + if (bytes < 1024) return `${bytes} B`; + if (bytes < 1024 ** 2) return `${(bytes / 1024).toFixed(1)} KB`; + if (bytes < 1024 ** 3) return `${(bytes / 1024 ** 2).toFixed(1)} MB`; + return `${(bytes / 1024 ** 3).toFixed(1)} GB`; + }; + + return ( + + + {dataset.name} + + + {dataset.description && ( + + {dataset.description} + + )} + + + {dataset.tags?.slice(0, 3).map((tag) => ( + + ))} + + + + + + {dataset.row_count + ? `${dataset.row_count.toLocaleString()} ${t("hub:rows")}` + : "—"} + {dataset.size_bytes ? 
` · ${formatSize(dataset.size_bytes)}` : ""} + + + + ); +} diff --git a/DashAI/front/src/components/hub/DatasetGrid.jsx b/DashAI/front/src/components/hub/DatasetGrid.jsx new file mode 100644 index 000000000..2101a535d --- /dev/null +++ b/DashAI/front/src/components/hub/DatasetGrid.jsx @@ -0,0 +1,134 @@ +import { useCallback, useEffect, useRef, useState } from "react"; +import { + Box, + CircularProgress, + InputAdornment, + TextField, + Typography, +} from "@mui/material"; +import SearchIcon from "@mui/icons-material/Search"; +import { useTranslation } from "react-i18next"; +import { searchDatasets } from "../../api/hub"; +import DatasetCard from "./DatasetCard"; + +/** + * Center panel — debounced search bar + grid of DatasetCard components. + * + * @param {string|null} sourceName - Active DatasetSource class name. + * @param {object|null} selectedDataset - Currently selected DatasetEntry. + * @param {function} onSelectDataset - Called with a DatasetEntry when a card is clicked. + */ +export default function DatasetGrid({ sourceName, selectedDataset, onSelectDataset }) { + const { t } = useTranslation(["hub"]); + const [query, setQuery] = useState(""); + const [datasets, setDatasets] = useState([]); + const [loading, setLoading] = useState(false); + const debounceRef = useRef(null); + + const fetchDatasets = useCallback( + (q) => { + if (!sourceName) return; + setLoading(true); + searchDatasets(sourceName, q, 40) + .then(setDatasets) + .catch(() => setDatasets([])) + .finally(() => setLoading(false)); + }, + [sourceName], + ); + + useEffect(() => { + setDatasets([]); + setQuery(""); + if (sourceName) fetchDatasets(""); + }, [sourceName]); + + const handleQueryChange = (e) => { + const val = e.target.value; + setQuery(val); + clearTimeout(debounceRef.current); + debounceRef.current = setTimeout(() => fetchDatasets(val), 400); + }; + + if (!sourceName) { + return ( + + + {t("hub:noSourceSelected")} + + + ); + } + + return ( + + + + + ), + }, + }} + /> + + {loading ? 
( + + + + ) : datasets.length === 0 ? ( + + + {t("hub:noResults")} + + + ) : ( + + {datasets.map((ds) => ( + onSelectDataset(ds)} + /> + ))} + + )} + + ); +} diff --git a/DashAI/front/src/components/hub/HubLeftBar.jsx b/DashAI/front/src/components/hub/HubLeftBar.jsx new file mode 100644 index 000000000..ee7792675 --- /dev/null +++ b/DashAI/front/src/components/hub/HubLeftBar.jsx @@ -0,0 +1,90 @@ +import { useEffect, useState } from "react"; +import { + Box, + CircularProgress, + List, + ListItemButton, + ListItemText, + Typography, +} from "@mui/material"; +import { useTheme } from "@mui/material/styles"; +import { useTranslation } from "react-i18next"; +import { getDatasetSources } from "../../api/hub"; + +/** + * Left sidebar for the Hub module — lists available DatasetSource components. + * + * @param {string|null} selectedSource - Currently active source name. + * @param {function} onSelectSource - Called with source name when user clicks. + */ +export default function HubLeftBar({ selectedSource, onSelectSource }) { + const { t } = useTranslation(["hub"]); + const theme = useTheme(); + const [sources, setSources] = useState([]); + const [loading, setLoading] = useState(true); + + useEffect(() => { + getDatasetSources() + .then(setSources) + .catch(() => setSources([])) + .finally(() => setLoading(false)); + }, []); + + return ( + + + + {t("hub:title")} + + + + {loading ? 
( + + + + ) : ( + + {sources.map((source) => ( + onSelectSource(source.name)} + sx={{ + "&.Mui-selected": { + bgcolor: "action.selected", + borderLeft: `3px solid ${theme.palette.primary.main}`, + }, + "&.Mui-selected:hover": { bgcolor: "action.selected" }, + }} + > + + + ))} + + )} + + ); +} From cc0d9ac4402746e8e803009a35a2847c4f358f17 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Tue, 5 May 2026 15:33:34 -0400 Subject: [PATCH 026/361] feat: add DatasetDetail and ImportDatasetDialog components --- .../src/components/hub/DatasetDetail.jsx | 160 ++++++++++++++++++ .../components/hub/ImportDatasetDialog.jsx | 132 +++++++++++++++ 2 files changed, 292 insertions(+) create mode 100644 DashAI/front/src/components/hub/DatasetDetail.jsx create mode 100644 DashAI/front/src/components/hub/ImportDatasetDialog.jsx diff --git a/DashAI/front/src/components/hub/DatasetDetail.jsx b/DashAI/front/src/components/hub/DatasetDetail.jsx new file mode 100644 index 000000000..ac354ce0f --- /dev/null +++ b/DashAI/front/src/components/hub/DatasetDetail.jsx @@ -0,0 +1,160 @@ +import { useState } from "react"; +import { + Box, + Button, + Chip, + Divider, + Link, + Stack, + Typography, +} from "@mui/material"; +import OpenInNewIcon from "@mui/icons-material/OpenInNew"; +import DownloadIcon from "@mui/icons-material/Download"; +import AddIcon from "@mui/icons-material/Add"; +import { useTheme } from "@mui/material/styles"; +import { useTranslation } from "react-i18next"; +import { getDownloadUrl } from "../../api/hub"; +import ImportDatasetDialog from "./ImportDatasetDialog"; + +/** + * Right panel — detailed view of a selected Hub dataset with action buttons. + * + * @param {object|null} dataset - Selected DatasetEntry, or null if none. + * @param {string|null} sourceName - Active DatasetSource class name. + * @param {function} onImported - Called after a successful import. 
+ */ +export default function DatasetDetail({ dataset, sourceName, onImported }) { + const { t } = useTranslation(["hub"]); + const theme = useTheme(); + const [importOpen, setImportOpen] = useState(false); + + if (!dataset) { + return ( + + + {t("hub:selectDatasetToPreview")} + + + ); + } + + const handleDownload = async () => { + try { + const url = await getDownloadUrl(sourceName, dataset.id); + window.location.href = url; + } catch { + // silently fail — source page link is still available + } + }; + + return ( + + + + {dataset.name} + + + + + + + + + {t("hub:viewOnSource")} + + + + + {dataset.description && ( + + {dataset.description} + + )} + + + + + {dataset.row_count != null && ( + + + {t("hub:rows")} + + + {dataset.row_count.toLocaleString()} + + + )} + + {dataset.tags?.length > 0 && ( + + + {t("hub:tags")} + + + {dataset.tags.map((tag) => ( + + ))} + + + )} + + + + setImportOpen(false)} + sourceName={sourceName} + dataset={dataset} + onImported={onImported} + /> + + ); +} diff --git a/DashAI/front/src/components/hub/ImportDatasetDialog.jsx b/DashAI/front/src/components/hub/ImportDatasetDialog.jsx new file mode 100644 index 000000000..2a9021087 --- /dev/null +++ b/DashAI/front/src/components/hub/ImportDatasetDialog.jsx @@ -0,0 +1,132 @@ +import { useCallback, useEffect, useState } from "react"; +import { + Box, + Button, + CircularProgress, + Dialog, + DialogActions, + DialogContent, + DialogTitle, + TextField, + Typography, +} from "@mui/material"; +import { useSnackbar } from "notistack"; +import { useTranslation } from "react-i18next"; +import { createDataset } from "../../api/datasets"; +import { importHubDataset, previewHubDataset } from "../../api/hub"; +import PreviewDataset from "../notebooks/datasetCreation/PreviewDataset"; + +/** + * Dialog that previews a Hub dataset and imports it into DashAI on confirm. + * + * @param {boolean} open - Whether the dialog is open. + * @param {function} onClose - Called when the dialog is dismissed. 
+ * @param {string} sourceName - DatasetSource class name. + * @param {object|null} dataset - DatasetEntry to import. + * @param {function} onImported - Called with job_id after successful enqueue. + */ +export default function ImportDatasetDialog({ + open, + onClose, + sourceName, + dataset, + onImported, +}) { + const { t } = useTranslation(["hub", "common"]); + const { enqueueSnackbar } = useSnackbar(); + + const [name, setName] = useState(""); + const [previewData, setPreviewData] = useState(null); + const [previewLoading, setPreviewLoading] = useState(false); + const [previewError, setPreviewError] = useState(false); + const [columnTypes, setColumnTypes] = useState({}); + const [columnRenames, setColumnRenames] = useState({}); + const [importing, setImporting] = useState(false); + + useEffect(() => { + if (!open || !dataset || !sourceName) return; + setName(dataset.name || ""); + setPreviewData(null); + setPreviewLoading(true); + setPreviewError(false); + setColumnTypes({}); + setColumnRenames({}); + + previewHubDataset(sourceName, dataset.id, 100) + .then((data) => { + setPreviewData(data); + setColumnTypes(data.inferred_types || {}); + }) + .catch(() => setPreviewError(true)) + .finally(() => setPreviewLoading(false)); + }, [open, dataset, sourceName]); + + const handleColumnRename = useCallback((oldName, newName) => { + setColumnRenames((prev) => ({ ...prev, [oldName]: newName })); + }, []); + + const handleImport = async () => { + if (!name.trim() || !dataset) return; + setImporting(true); + try { + const created = await createDataset(name.trim()); + await importHubDataset(sourceName, dataset.id, created.id, { + inferred_types: columnTypes, + column_renames: columnRenames, + }); + enqueueSnackbar(t("hub:importSuccess"), { variant: "success" }); + onImported?.(); + onClose(); + } catch { + enqueueSnackbar(t("hub:importError"), { variant: "error" }); + } finally { + setImporting(false); + } + }; + + return ( + + {t("hub:importDataset")} + + + 
setName(e.target.value)} + fullWidth + /> + + + {previewLoading && ( + + + + )} + + {previewError && !previewLoading && ( + {t("hub:previewError")} + )} + + {!previewLoading && !previewError && previewData && ( + + )} + + + + + + + ); +} From 3a619b5d8768fc0a67c84ba1af524369c4811d53 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Tue, 5 May 2026 15:34:15 -0400 Subject: [PATCH 027/361] feat: add Hub page, route, and nav entry --- DashAI/front/src/App.jsx | 2 + .../front/src/components/ResponsiveAppBar.jsx | 1 + DashAI/front/src/pages/hub/HubContent.jsx | 54 +++++++++++++++++++ 3 files changed, 57 insertions(+) create mode 100644 DashAI/front/src/pages/hub/HubContent.jsx diff --git a/DashAI/front/src/App.jsx b/DashAI/front/src/App.jsx index 7bbc3db4c..c43937f95 100644 --- a/DashAI/front/src/App.jsx +++ b/DashAI/front/src/App.jsx @@ -13,6 +13,7 @@ import PipelinesPage from "./pages/pipelines/Pipelines"; import PluginsDetails from "./pages/plugins/components/PluginsDetails"; import Generative from "./pages/generative/Generative"; import NewPipelineWrapper from "./pages/pipelines/newPipelineWrapper"; +import HubContent from "./pages/hub/HubContent"; import JobQueueWidget from "./components/jobs/JobQueueWidget"; function App() { @@ -61,6 +62,7 @@ function App() { } /> + } /> diff --git a/DashAI/front/src/components/ResponsiveAppBar.jsx b/DashAI/front/src/components/ResponsiveAppBar.jsx index 577df3c2a..8f4ba5bec 100644 --- a/DashAI/front/src/components/ResponsiveAppBar.jsx +++ b/DashAI/front/src/components/ResponsiveAppBar.jsx @@ -30,6 +30,7 @@ function ResponsiveAppBar() { { name: t("common:datasets"), to: "/app/data" }, { name: t("common:models"), to: "/app/models" }, { name: t("common:generative"), to: "/app/generative" }, + { name: t("common:hub"), to: "/app/hub" }, { name: t("common:plugins"), to: "/app/plugins/browse" }, ]; diff --git a/DashAI/front/src/pages/hub/HubContent.jsx b/DashAI/front/src/pages/hub/HubContent.jsx new file mode 100644 index 000000000..2bdc6db76 
--- /dev/null +++ b/DashAI/front/src/pages/hub/HubContent.jsx @@ -0,0 +1,54 @@ +import { useState } from "react"; +import ModuleContainer from "../../components/layout/ModuleContainer"; +import LeftPanel from "../../components/threeSectionLayout/panels/LeftPanel"; +import CenterPanel from "../../components/threeSectionLayout/panels/CenterPanel"; +import RightPanel from "../../components/threeSectionLayout/panels/RightPanel"; +import { ThreePanelLayoutContext } from "../../components/threeSectionLayout/panels/ThreePanelLayoutContext"; +import { useThreePanelLayout } from "../../hooks/useThreePanelsLayout"; +import HubLeftBar from "../../components/hub/HubLeftBar"; +import DatasetGrid from "../../components/hub/DatasetGrid"; +import DatasetDetail from "../../components/hub/DatasetDetail"; + +export default function HubContent() { + const threePanelLayout = useThreePanelLayout({ storageKey: "hub" }); + const [selectedSource, setSelectedSource] = useState(null); + const [selectedDataset, setSelectedDataset] = useState(null); + + const handleSelectSource = (sourceName) => { + setSelectedSource(sourceName); + setSelectedDataset(null); + }; + + const handleImported = () => { + setSelectedDataset(null); + }; + + return ( + + + + + + + + + + + + + + + + ); +} From dc39e345f5b85dc5e3938cff21785bcf77eaf4dd Mon Sep 17 00:00:00 2001 From: Irozuku Date: Tue, 5 May 2026 15:38:29 -0400 Subject: [PATCH 028/361] fix: resolve MultilingualString to plain string in list_sources endpoint --- DashAI/back/api/api_v1/endpoints/dataset_source.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/DashAI/back/api/api_v1/endpoints/dataset_source.py b/DashAI/back/api/api_v1/endpoints/dataset_source.py index 811e20fdb..54f57aa5d 100644 --- a/DashAI/back/api/api_v1/endpoints/dataset_source.py +++ b/DashAI/back/api/api_v1/endpoints/dataset_source.py @@ -9,6 +9,7 @@ from kink import di from pydantic import BaseModel +from DashAI.back.core.utils import 
MultilingualString from DashAI.back.types.inf.type_inference import infer_types if TYPE_CHECKING: @@ -18,6 +19,15 @@ router = APIRouter() +def _resolve_string(value: Any, default: str) -> str: + """Return the English text from a MultilingualString, or the value itself if plain str.""" + if isinstance(value, MultilingualString): + return value.en + if isinstance(value, str): + return value + return default + + def _get_source(source_name: str, registry: "ComponentRegistry"): """Retrieve and instantiate a DatasetSource from the registry. @@ -68,8 +78,8 @@ async def list_sources( { "name": name, "type": "DatasetSource", - "display_name": str(getattr(info["class"], "DISPLAY_NAME", name)), - "description": str(getattr(info["class"], "DESCRIPTION", "")), + "display_name": _resolve_string(info.get("display_name"), name), + "description": _resolve_string(info.get("description"), ""), } for name, info in sources.items() ] From da0f26f4695bb1ca4d0342ed17c4ae1ca3e62265 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Tue, 5 May 2026 15:40:06 -0400 Subject: [PATCH 029/361] fix: use dataset url field instead of authenticated download endpoint for OpenML --- DashAI/back/dataset_sources/openml_dataset_source.py | 12 ++++-------- .../back/dataset_sources/test_base_dataset_source.py | 2 +- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/DashAI/back/dataset_sources/openml_dataset_source.py b/DashAI/back/dataset_sources/openml_dataset_source.py index cd27801ae..1b4aa497b 100644 --- a/DashAI/back/dataset_sources/openml_dataset_source.py +++ b/DashAI/back/dataset_sources/openml_dataset_source.py @@ -13,7 +13,6 @@ log = logging.getLogger(__name__) _OPENML_API = "https://www.openml.org/api/v1/json" -_OPENML_DATA = "https://data.openml.org/data/v1/download" def _parse_quality(qualities: list[dict], name: str) -> int | None: @@ -132,11 +131,8 @@ def fetch_preview(self, dataset_id: str, n_rows: int = 100) -> "pd.DataFrame": if info_resp.status_code != 200: return pd.DataFrame() - 
file_id = info_resp.json()["data_set_description"]["file_id"] - file_resp = httpx.get( - f"{_OPENML_DATA}/{file_id}", - timeout=60, - ) + arff_url = info_resp.json()["data_set_description"]["url"] + file_resp = httpx.get(arff_url, timeout=60, follow_redirects=True) if file_resp.status_code != 200: return pd.DataFrame() @@ -170,9 +166,9 @@ def fetch_full(self, dataset_id: str, temp_path: str) -> tuple[str, str]: info_resp = httpx.get(f"{_OPENML_API}/data/{dataset_id}", timeout=15) info_resp.raise_for_status() - file_id = info_resp.json()["data_set_description"]["file_id"] + arff_url = info_resp.json()["data_set_description"]["url"] - file_resp = httpx.get(f"{_OPENML_DATA}/{file_id}", timeout=120) + file_resp = httpx.get(arff_url, timeout=120, follow_redirects=True) file_resp.raise_for_status() arff_text = file_resp.content.decode("utf-8", errors="replace") diff --git a/tests/back/dataset_sources/test_base_dataset_source.py b/tests/back/dataset_sources/test_base_dataset_source.py index 63c03eb52..354dc27ac 100644 --- a/tests/back/dataset_sources/test_base_dataset_source.py +++ b/tests/back/dataset_sources/test_base_dataset_source.py @@ -235,7 +235,7 @@ def test_openml_fetch_preview_returns_dataframe(): info_response = MagicMock() info_response.status_code = 200 info_response.json.return_value = { - "data_set_description": {"file_id": "22044555"} + "data_set_description": {"file_id": "22044555", "url": "https://openml.org/data/v1/download/22044555/iris.arff"} } arff_content = b"""@relation iris From d1a5e18bfb1dc28ff6fe063c25af7cc76e7059c9 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Tue, 5 May 2026 15:50:55 -0400 Subject: [PATCH 030/361] feat: add pagination, OpenML descriptions/tags, and fix HuggingFace preview config discovery --- .../api/api_v1/endpoints/dataset_source.py | 11 +- .../dataset_sources/base_dataset_source.py | 4 +- .../huggingface_dataset_source.py | 30 ++++- .../dataset_sources/openml_dataset_source.py | 58 ++++++++-- DashAI/front/src/api/hub.ts | 3 
+- .../front/src/components/hub/DatasetGrid.jsx | 104 ++++++++++++------ .../test_base_dataset_source.py | 29 +++-- 7 files changed, 180 insertions(+), 59 deletions(-) diff --git a/DashAI/back/api/api_v1/endpoints/dataset_source.py b/DashAI/back/api/api_v1/endpoints/dataset_source.py index 54f57aa5d..e9a1e9189 100644 --- a/DashAI/back/api/api_v1/endpoints/dataset_source.py +++ b/DashAI/back/api/api_v1/endpoints/dataset_source.py @@ -90,6 +90,7 @@ async def search_datasets( source_name: str, q: str = Query(default="", description="Search query"), limit: int = Query(default=20, ge=1, le=100), + offset: int = Query(default=0, ge=0), registry: "ComponentRegistry" = Depends(lambda: di["component_registry"]), ) -> List[Dict[str, Any]]: """Search for datasets in a registered source. @@ -102,6 +103,8 @@ async def search_datasets( Search query string. limit : int Maximum number of results (1-100). + offset : int + Number of results to skip (for pagination). registry : ComponentRegistry Injected component registry. @@ -111,7 +114,7 @@ async def search_datasets( List of DatasetEntry dicts. 
""" source = _get_source(source_name, registry) - results = source.search(q, limit=limit) + results = source.search(q, limit=limit, offset=offset) return [ { "id": e.id, @@ -192,6 +195,12 @@ async def preview_dataset( detail=f"Failed to fetch preview from source: {exc}", ) from exc + if df.empty: + raise HTTPException( + status_code=status.HTTP_502_BAD_GATEWAY, + detail=f"Source returned no data for dataset '{decoded_id}'.", + ) + inferred = infer_types(df, method="DashAIPtype") sample = df.to_dict(orient="records") diff --git a/DashAI/back/dataset_sources/base_dataset_source.py b/DashAI/back/dataset_sources/base_dataset_source.py index b201a586f..d5193a00d 100644 --- a/DashAI/back/dataset_sources/base_dataset_source.py +++ b/DashAI/back/dataset_sources/base_dataset_source.py @@ -58,7 +58,7 @@ class BaseDatasetSource(ConfigObject, ABC): DESCRIPTION: Final = MultilingualString(en="", es="") @abstractmethod - def search(self, query: str, limit: int = 20, **filters: Any) -> list[DatasetEntry]: + def search(self, query: str, limit: int = 20, offset: int = 0, **filters: Any) -> list[DatasetEntry]: """Return datasets matching a query string. Parameters @@ -67,6 +67,8 @@ def search(self, query: str, limit: int = 20, **filters: Any) -> list[DatasetEnt Free-text search string. limit : int, optional Maximum number of results, by default 20. + offset : int, optional + Number of results to skip (for pagination), by default 0. **filters : Any Source-specific filter keyword arguments. 
diff --git a/DashAI/back/dataset_sources/huggingface_dataset_source.py b/DashAI/back/dataset_sources/huggingface_dataset_source.py index 256d38deb..4731d96b7 100644 --- a/DashAI/back/dataset_sources/huggingface_dataset_source.py +++ b/DashAI/back/dataset_sources/huggingface_dataset_source.py @@ -12,6 +12,7 @@ log = logging.getLogger(__name__) _HF_API = "https://huggingface.co/api/datasets" +_HF_SPLITS_API = "https://datasets-server.huggingface.co/splits" _HF_ROWS_API = "https://datasets-server.huggingface.co/first-rows" @@ -31,7 +32,7 @@ class HuggingFaceDatasetSource(BaseDatasetSource): es="Navega e importa datasets públicos desde HuggingFace Hub.", ) - def search(self, query: str, limit: int = 20, **filters: Any) -> list[DatasetEntry]: + def search(self, query: str, limit: int = 20, offset: int = 0, **filters: Any) -> list[DatasetEntry]: """Return public HuggingFace datasets matching a query. Parameters @@ -40,6 +41,8 @@ def search(self, query: str, limit: int = 20, **filters: Any) -> list[DatasetEnt Search string passed to the HuggingFace datasets API. limit : int, optional Maximum number of results, by default 20. + offset : int, optional + Number of results to skip (for pagination), by default 0. **filters : Any Unused; reserved for future tag/task filters. 
@@ -51,7 +54,7 @@ def search(self, query: str, limit: int = 20, **filters: Any) -> list[DatasetEnt try: resp = httpx.get( _HF_API, - params={"search": query, "limit": limit, "full": "True"}, + params={"search": query, "limit": limit, "offset": offset, "full": "True"}, timeout=15, ) if resp.status_code != 200: @@ -95,12 +98,31 @@ def fetch_preview(self, dataset_id: str, n_rows: int = 100) -> "pd.DataFrame": import pandas as pd try: + # Discover available configs and splits — don't assume "default"/"train" + splits_resp = httpx.get( + _HF_SPLITS_API, + params={"dataset": dataset_id}, + timeout=20, + ) + if splits_resp.status_code != 200: + log.warning( + "HuggingFace splits API returned %s for %s", + splits_resp.status_code, + dataset_id, + ) + return pd.DataFrame() + + splits = splits_resp.json().get("splits", []) + if not splits: + return pd.DataFrame() + + first = splits[0] resp = httpx.get( _HF_ROWS_API, params={ "dataset": dataset_id, - "config": "default", - "split": "train", + "config": first["config"], + "split": first["split"], "offset": 0, "length": min(n_rows, 100), }, diff --git a/DashAI/back/dataset_sources/openml_dataset_source.py b/DashAI/back/dataset_sources/openml_dataset_source.py index 1b4aa497b..e844d9d30 100644 --- a/DashAI/back/dataset_sources/openml_dataset_source.py +++ b/DashAI/back/dataset_sources/openml_dataset_source.py @@ -3,6 +3,7 @@ import io import logging import os +from concurrent.futures import ThreadPoolExecutor from typing import Any, Final import httpx @@ -39,6 +40,33 @@ def _parse_quality(qualities: list[dict], name: str) -> int | None: return None +def _fetch_openml_details(dataset_id: str) -> dict: + """Fetch description and tags for a single OpenML dataset. + + Parameters + ---------- + dataset_id : str + OpenML dataset ID (integer as string). + + Returns + ------- + dict + ``{"description": str, "tags": list[str]}`` — empty strings/lists on error. 
+ """ + try: + resp = httpx.get(f"{_OPENML_API}/data/{dataset_id}", timeout=10) + if resp.status_code == 200: + desc = resp.json()["data_set_description"] + tag_raw = desc.get("tag", []) + return { + "description": desc.get("description") or "", + "tags": [tag_raw] if isinstance(tag_raw, str) else (tag_raw or []), + } + except Exception: + log.debug("Could not fetch details for OpenML dataset %s", dataset_id) + return {"description": "", "tags": []} + + class OpenMLDatasetSource(BaseDatasetSource): """Dataset source that fetches public datasets from OpenML. @@ -54,7 +82,7 @@ class OpenMLDatasetSource(BaseDatasetSource): es="Navega e importa datasets públicos desde OpenML.", ) - def search(self, query: str, limit: int = 20, **filters: Any) -> list[DatasetEntry]: + def search(self, query: str, limit: int = 20, offset: int = 0, **filters: Any) -> list[DatasetEntry]: """Return active OpenML datasets matching a name query. Parameters @@ -63,37 +91,49 @@ def search(self, query: str, limit: int = 20, **filters: Any) -> list[DatasetEnt Dataset name search string. limit : int, optional Maximum number of results, by default 20. + offset : int, optional + Number of results to skip (for pagination), by default 0. **filters : Any Unused; reserved for future filters. Returns ------- list[DatasetEntry] - Matching datasets. Returns empty list on API error. + Matching datasets with descriptions and tags fetched in parallel. + Returns empty list on API error. 
""" try: resp = httpx.get( f"{_OPENML_API}/data/list", - params={"data_name": query, "limit": limit, "status": "active"}, + params={"data_name": query, "limit": limit, "offset": offset, "status": "active"}, timeout=15, ) if resp.status_code != 200: log.warning("OpenML API returned %s", resp.status_code) return [] - items = resp.json().get("data", {}).get("dataset", []) + raw_items = resp.json().get("data", {}).get("dataset", []) + if not raw_items: + return [] + + # Fetch descriptions and tags for all results in parallel + with ThreadPoolExecutor(max_workers=min(len(raw_items), 10)) as executor: + detail_futures = [ + executor.submit(_fetch_openml_details, str(item["did"])) + for item in raw_items + ] + details_list = [f.result() for f in detail_futures] + entries = [] - for item in items: + for item, details in zip(raw_items, details_list): did = str(item.get("did", "")) qualities = item.get("quality", []) - tag_raw = item.get("tag", []) - tags = [tag_raw] if isinstance(tag_raw, str) else tag_raw entries.append( DatasetEntry( id=did, name=item.get("name", ""), - description=item.get("description") or "", - tags=tags, + description=details["description"], + tags=details["tags"], size_bytes=None, row_count=_parse_quality(qualities, "NumberOfInstances"), url=f"https://www.openml.org/d/{did}", diff --git a/DashAI/front/src/api/hub.ts b/DashAI/front/src/api/hub.ts index 40c108836..11eb522d1 100644 --- a/DashAI/front/src/api/hub.ts +++ b/DashAI/front/src/api/hub.ts @@ -35,10 +35,11 @@ export const searchDatasets = async ( sourceName: string, query: string, limit = 20, + offset = 0, ): Promise => { const response = await api.get( `${hubEndpoint}/${sourceName}/search`, - { params: { q: query, limit } }, + { params: { q: query, limit, offset } }, ); return response.data; }; diff --git a/DashAI/front/src/components/hub/DatasetGrid.jsx b/DashAI/front/src/components/hub/DatasetGrid.jsx index 2101a535d..7c4924460 100644 --- a/DashAI/front/src/components/hub/DatasetGrid.jsx +++ 
b/DashAI/front/src/components/hub/DatasetGrid.jsx @@ -1,6 +1,7 @@ import { useCallback, useEffect, useRef, useState } from "react"; import { Box, + Button, CircularProgress, InputAdornment, TextField, @@ -11,43 +12,68 @@ import { useTranslation } from "react-i18next"; import { searchDatasets } from "../../api/hub"; import DatasetCard from "./DatasetCard"; +const PAGE_SIZE = 20; + /** - * Center panel — debounced search bar + grid of DatasetCard components. + * Center panel — debounced search bar + paginated grid of DatasetCard components. * * @param {string|null} sourceName - Active DatasetSource class name. * @param {object|null} selectedDataset - Currently selected DatasetEntry. * @param {function} onSelectDataset - Called with a DatasetEntry when a card is clicked. */ export default function DatasetGrid({ sourceName, selectedDataset, onSelectDataset }) { - const { t } = useTranslation(["hub"]); + const { t } = useTranslation(["hub", "common"]); const [query, setQuery] = useState(""); const [datasets, setDatasets] = useState([]); + const [offset, setOffset] = useState(0); + const [hasMore, setHasMore] = useState(false); const [loading, setLoading] = useState(false); + const [loadingMore, setLoadingMore] = useState(false); const debounceRef = useRef(null); - const fetchDatasets = useCallback( - (q) => { + const loadPage = useCallback( + (q, pageOffset, append) => { if (!sourceName) return; - setLoading(true); - searchDatasets(sourceName, q, 40) - .then(setDatasets) - .catch(() => setDatasets([])) - .finally(() => setLoading(false)); + if (append) setLoadingMore(true); + else setLoading(true); + + searchDatasets(sourceName, q, PAGE_SIZE, pageOffset) + .then((results) => { + setDatasets((prev) => (append ? 
[...prev, ...results] : results)); + setHasMore(results.length === PAGE_SIZE); + }) + .catch(() => { + if (!append) setDatasets([]); + setHasMore(false); + }) + .finally(() => { + if (append) setLoadingMore(false); + else setLoading(false); + }); }, [sourceName], ); useEffect(() => { setDatasets([]); + setOffset(0); setQuery(""); - if (sourceName) fetchDatasets(""); + setHasMore(false); + if (sourceName) loadPage("", 0, false); }, [sourceName]); const handleQueryChange = (e) => { const val = e.target.value; setQuery(val); + setOffset(0); clearTimeout(debounceRef.current); - debounceRef.current = setTimeout(() => fetchDatasets(val), 400); + debounceRef.current = setTimeout(() => loadPage(val, 0, false), 400); + }; + + const handleLoadMore = () => { + const next = offset + PAGE_SIZE; + setOffset(next); + loadPage(query, next, true); }; if (!sourceName) { @@ -105,28 +131,40 @@ export default function DatasetGrid({ sourceName, selectedDataset, onSelectDatas ) : ( - - {datasets.map((ds) => ( - onSelectDataset(ds)} - /> - ))} + + + {datasets.map((ds) => ( + onSelectDataset(ds)} + /> + ))} + + + {hasMore && ( + + {loadingMore ? 
( + + ) : ( + + )} + + )} )} diff --git a/tests/back/dataset_sources/test_base_dataset_source.py b/tests/back/dataset_sources/test_base_dataset_source.py index 354dc27ac..08ad85ac0 100644 --- a/tests/back/dataset_sources/test_base_dataset_source.py +++ b/tests/back/dataset_sources/test_base_dataset_source.py @@ -156,9 +156,15 @@ def test_hf_get_download_url(): def test_hf_fetch_preview_returns_dataframe(): import pandas as pd - mock_response = MagicMock() - mock_response.status_code = 200 - mock_response.json.return_value = { + splits_response = MagicMock() + splits_response.status_code = 200 + splits_response.json.return_value = { + "splits": [{"dataset": "stanfordnlp/imdb", "config": "plain_text", "split": "train"}] + } + + rows_response = MagicMock() + rows_response.status_code = 200 + rows_response.json.return_value = { "features": [{"name": "text"}, {"name": "label"}], "rows": [ {"row": {"text": "good movie", "label": 1}}, @@ -166,7 +172,7 @@ def test_hf_fetch_preview_returns_dataframe(): ], } - with patch("httpx.get", return_value=mock_response): + with patch("httpx.get", side_effect=[splits_response, rows_response]): source = HuggingFaceDatasetSource() df = source.fetch_preview("stanfordnlp/imdb", n_rows=2) @@ -183,24 +189,25 @@ def test_openml_source_has_correct_type(): def test_openml_search_returns_dataset_entries(): - mock_response = MagicMock() - mock_response.status_code = 200 - mock_response.json.return_value = { + mock_list = MagicMock() + mock_list.status_code = 200 + mock_list.json.return_value = { "data": { "dataset": [ { "did": 61, "name": "iris", - "description": "The Iris dataset", - "tag": ["study_14", "uci"], "file_id": 22044555, "quality": [{"name": "NumberOfInstances", "value": "150"}], } ] } } + mock_details = {"description": "The Iris dataset", "tags": ["study_14", "uci"]} - with patch("httpx.get", return_value=mock_response): + with patch("httpx.get", return_value=mock_list), \ + 
patch("DashAI.back.dataset_sources.openml_dataset_source._fetch_openml_details", + return_value=mock_details): source = OpenMLDatasetSource() results = source.search("iris", limit=5) @@ -208,6 +215,8 @@ def test_openml_search_returns_dataset_entries(): assert results[0].id == "61" assert results[0].name == "iris" assert results[0].row_count == 150 + assert results[0].description == "The Iris dataset" + assert "study_14" in results[0].tags assert results[0].source == "OpenMLDatasetSource" assert results[0].url == "https://www.openml.org/d/61" From b1c5775d8ba914e081092fa2872e6b9a1330fe93 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Tue, 5 May 2026 15:56:12 -0400 Subject: [PATCH 031/361] feat: add lazy dataset info endpoint and fix OpenML fetch overhead Add GET /info endpoint per source that fetches description+tags for a single dataset on demand. OpenML search now returns empty description/tags; DatasetDetail fetches details lazily via getDatasetInfo when a dataset is selected, reducing HTTP calls from 20/page to 1/click. Tags section also updated to prefer lazily-fetched extraInfo over search-time data. 
--- .../api/api_v1/endpoints/dataset_source.py | 34 +++++++++++++++ .../dataset_sources/base_dataset_source.py | 19 +++++++++ .../dataset_sources/openml_dataset_source.py | 42 +++++++++++++------ DashAI/front/src/api/hub.ts | 11 +++++ .../src/components/hub/DatasetDetail.jsx | 24 ++++++++--- .../test_base_dataset_source.py | 10 ++--- 6 files changed, 115 insertions(+), 25 deletions(-) diff --git a/DashAI/back/api/api_v1/endpoints/dataset_source.py b/DashAI/back/api/api_v1/endpoints/dataset_source.py index e9a1e9189..f9db41756 100644 --- a/DashAI/back/api/api_v1/endpoints/dataset_source.py +++ b/DashAI/back/api/api_v1/endpoints/dataset_source.py @@ -130,6 +130,40 @@ async def search_datasets( ] +@router.get("/{source_name}/{dataset_id:path}/info") +async def get_dataset_info( + source_name: str, + dataset_id: str, + registry: "ComponentRegistry" = Depends(lambda: di["component_registry"]), +) -> Dict[str, Any]: + """Return full metadata for a single dataset (description, tags, etc.). + + Parameters + ---------- + source_name : str + Registered DatasetSource class name. + dataset_id : str + Source-specific dataset identifier (URL-encoded). + registry : ComponentRegistry + Injected component registry. + + Returns + ------- + dict + DatasetEntry fields, or empty dict if the source has no enrichment. 
+ """ + source = _get_source(source_name, registry) + decoded_id = unquote(dataset_id) + entry = source.get_info(decoded_id) + if entry is None: + return {} + return { + "id": entry.id, + "description": entry.description, + "tags": entry.tags, + } + + @router.get("/{source_name}/{dataset_id:path}/download-url") async def get_download_url( source_name: str, diff --git a/DashAI/back/dataset_sources/base_dataset_source.py b/DashAI/back/dataset_sources/base_dataset_source.py index d5193a00d..03901f712 100644 --- a/DashAI/back/dataset_sources/base_dataset_source.py +++ b/DashAI/back/dataset_sources/base_dataset_source.py @@ -79,6 +79,25 @@ def search(self, query: str, limit: int = 20, offset: int = 0, **filters: Any) - """ ... + def get_info(self, dataset_id: str) -> DatasetEntry | None: + """Return full metadata for a single dataset, including description and tags. + + The default implementation returns None (no enrichment). + Sources that require extra requests to retrieve description/tags + should override this method. + + Parameters + ---------- + dataset_id : str + Source-specific dataset identifier. + + Returns + ------- + DatasetEntry or None + Full metadata entry, or None if not available. + """ + return None + @abstractmethod def fetch_preview(self, dataset_id: str, n_rows: int = 100) -> "pd.DataFrame": """Download a sample of rows without fetching the full dataset. 
diff --git a/DashAI/back/dataset_sources/openml_dataset_source.py b/DashAI/back/dataset_sources/openml_dataset_source.py index e844d9d30..fe322b1ab 100644 --- a/DashAI/back/dataset_sources/openml_dataset_source.py +++ b/DashAI/back/dataset_sources/openml_dataset_source.py @@ -3,7 +3,6 @@ import io import logging import os -from concurrent.futures import ThreadPoolExecutor from typing import Any, Final import httpx @@ -116,24 +115,16 @@ def search(self, query: str, limit: int = 20, offset: int = 0, **filters: Any) - if not raw_items: return [] - # Fetch descriptions and tags for all results in parallel - with ThreadPoolExecutor(max_workers=min(len(raw_items), 10)) as executor: - detail_futures = [ - executor.submit(_fetch_openml_details, str(item["did"])) - for item in raw_items - ] - details_list = [f.result() for f in detail_futures] - entries = [] - for item, details in zip(raw_items, details_list): + for item in raw_items: did = str(item.get("did", "")) qualities = item.get("quality", []) entries.append( DatasetEntry( id=did, name=item.get("name", ""), - description=details["description"], - tags=details["tags"], + description="", + tags=[], size_bytes=None, row_count=_parse_quality(qualities, "NumberOfInstances"), url=f"https://www.openml.org/d/{did}", @@ -145,6 +136,33 @@ def search(self, query: str, limit: int = 20, offset: int = 0, **filters: Any) - log.exception("Error searching OpenML datasets") return [] + def get_info(self, dataset_id: str) -> "DatasetEntry | None": + """Return full metadata for a single OpenML dataset including description and tags. + + Parameters + ---------- + dataset_id : str + OpenML dataset ID (integer as string). + + Returns + ------- + DatasetEntry or None + Full metadata, or None on error. 
+ """ + details = _fetch_openml_details(dataset_id) + if not details["description"] and not details["tags"]: + return None + return DatasetEntry( + id=dataset_id, + name="", + description=details["description"], + tags=details["tags"], + size_bytes=None, + row_count=None, + url=f"https://www.openml.org/d/{dataset_id}", + source=self.__class__.__name__, + ) + def fetch_preview(self, dataset_id: str, n_rows: int = 100) -> "pd.DataFrame": """Download and parse sample rows from an OpenML dataset ARFF file. diff --git a/DashAI/front/src/api/hub.ts b/DashAI/front/src/api/hub.ts index 11eb522d1..358dcd330 100644 --- a/DashAI/front/src/api/hub.ts +++ b/DashAI/front/src/api/hub.ts @@ -44,6 +44,17 @@ export const searchDatasets = async ( return response.data; }; +export const getDatasetInfo = async ( + sourceName: string, + datasetId: string, +): Promise<{ id?: string; description?: string; tags?: string[] }> => { + const encodedId = encodeURIComponent(datasetId); + const response = await api.get<{ id?: string; description?: string; tags?: string[] }>( + `${hubEndpoint}/${sourceName}/${encodedId}/info`, + ); + return response.data; +}; + export const getDownloadUrl = async ( sourceName: string, datasetId: string, diff --git a/DashAI/front/src/components/hub/DatasetDetail.jsx b/DashAI/front/src/components/hub/DatasetDetail.jsx index ac354ce0f..1fec5edb4 100644 --- a/DashAI/front/src/components/hub/DatasetDetail.jsx +++ b/DashAI/front/src/components/hub/DatasetDetail.jsx @@ -1,4 +1,4 @@ -import { useState } from "react"; +import { useEffect, useState } from "react"; import { Box, Button, @@ -13,7 +13,7 @@ import DownloadIcon from "@mui/icons-material/Download"; import AddIcon from "@mui/icons-material/Add"; import { useTheme } from "@mui/material/styles"; import { useTranslation } from "react-i18next"; -import { getDownloadUrl } from "../../api/hub"; +import { getDatasetInfo, getDownloadUrl } from "../../api/hub"; import ImportDatasetDialog from "./ImportDatasetDialog"; /** @@ 
-27,6 +27,18 @@ export default function DatasetDetail({ dataset, sourceName, onImported }) { const { t } = useTranslation(["hub"]); const theme = useTheme(); const [importOpen, setImportOpen] = useState(false); + const [extraInfo, setExtraInfo] = useState(null); + + useEffect(() => { + if (!dataset || !sourceName) { + setExtraInfo(null); + return; + } + setExtraInfo(null); + getDatasetInfo(sourceName, dataset.id) + .then((info) => setExtraInfo(info)) + .catch(() => setExtraInfo({})); + }, [dataset?.id, sourceName]); if (!dataset) { return ( @@ -108,9 +120,9 @@ export default function DatasetDetail({ dataset, sourceName, onImported }) { - {dataset.description && ( + {(extraInfo?.description || dataset.description) && ( - {dataset.description} + {extraInfo?.description || dataset.description} )} @@ -128,7 +140,7 @@ export default function DatasetDetail({ dataset, sourceName, onImported }) { )} - {dataset.tags?.length > 0 && ( + {((extraInfo?.tags ?? dataset.tags)?.length > 0) && ( - {dataset.tags.map((tag) => ( + {(extraInfo?.tags ?? 
dataset.tags).map((tag) => ( ))} diff --git a/tests/back/dataset_sources/test_base_dataset_source.py b/tests/back/dataset_sources/test_base_dataset_source.py index 08ad85ac0..97cedcd87 100644 --- a/tests/back/dataset_sources/test_base_dataset_source.py +++ b/tests/back/dataset_sources/test_base_dataset_source.py @@ -203,11 +203,7 @@ def test_openml_search_returns_dataset_entries(): ] } } - mock_details = {"description": "The Iris dataset", "tags": ["study_14", "uci"]} - - with patch("httpx.get", return_value=mock_list), \ - patch("DashAI.back.dataset_sources.openml_dataset_source._fetch_openml_details", - return_value=mock_details): + with patch("httpx.get", return_value=mock_list): source = OpenMLDatasetSource() results = source.search("iris", limit=5) @@ -215,8 +211,8 @@ def test_openml_search_returns_dataset_entries(): assert results[0].id == "61" assert results[0].name == "iris" assert results[0].row_count == 150 - assert results[0].description == "The Iris dataset" - assert "study_14" in results[0].tags + assert results[0].description == "" + assert results[0].tags == [] assert results[0].source == "OpenMLDatasetSource" assert results[0].url == "https://www.openml.org/d/61" From 00a514b3fea1995f868c3133dbd5e012d4863e55 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Tue, 5 May 2026 15:57:16 -0400 Subject: [PATCH 032/361] fix: pass separator param to CSVDataLoader when importing hub datasets --- DashAI/back/job/dataset_job.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DashAI/back/job/dataset_job.py b/DashAI/back/job/dataset_job.py index f948b95a7..47fbcbc70 100644 --- a/DashAI/back/job/dataset_job.py +++ b/DashAI/back/job/dataset_job.py @@ -191,10 +191,11 @@ def run( file_path_hub, dataloader_name, ) + hub_loader_params = {"separator": ","} if dataloader_name == "CSVDataLoader" else {} new_dataset = dataloader.load_data( filepath_or_buffer=file_path_hub, temp_path=hub_temp, - params={}, + params=hub_loader_params, n_sample=None, ) else: 
From a5c6cf73b38e8802a9d8fc9333959c47e58df3e8 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Tue, 5 May 2026 16:12:54 -0400 Subject: [PATCH 033/361] feat: add COMPATIBLE_COMPONENTS to DatasetSource and multi-step import dialog Each DatasetSource now declares COMPATIBLE_COMPONENTS (list of DataLoader class names). The import dialog becomes a 3-step stepper: format selection (radio picker for multi-format sources, auto-selected for single), dataloader parameter form (driven by the component schema), and preview+confirm. The selected dataloader and its user-configured params are forwarded to the import job, replacing the hardcoded separator workaround. --- .../api/api_v1/endpoints/dataset_source.py | 1 + .../dataset_sources/base_dataset_source.py | 1 + .../huggingface_dataset_source.py | 1 + .../dataset_sources/openml_dataset_source.py | 1 + DashAI/back/job/dataset_job.py | 2 +- DashAI/front/src/api/hub.ts | 15 ++ .../src/components/hub/DatasetDetail.jsx | 3 +- .../front/src/components/hub/HubLeftBar.jsx | 2 +- .../components/hub/ImportDatasetDialog.jsx | 231 +++++++++++++++--- DashAI/front/src/pages/hub/HubContent.jsx | 13 +- .../front/src/utils/i18n/locales/en/hub.json | 7 +- .../front/src/utils/i18n/locales/es/hub.json | 7 +- 12 files changed, 241 insertions(+), 43 deletions(-) diff --git a/DashAI/back/api/api_v1/endpoints/dataset_source.py b/DashAI/back/api/api_v1/endpoints/dataset_source.py index f9db41756..ea860149c 100644 --- a/DashAI/back/api/api_v1/endpoints/dataset_source.py +++ b/DashAI/back/api/api_v1/endpoints/dataset_source.py @@ -80,6 +80,7 @@ async def list_sources( "type": "DatasetSource", "display_name": _resolve_string(info.get("display_name"), name), "description": _resolve_string(info.get("description"), ""), + "compatible_components": info["class"].COMPATIBLE_COMPONENTS, } for name, info in sources.items() ] diff --git a/DashAI/back/dataset_sources/base_dataset_source.py b/DashAI/back/dataset_sources/base_dataset_source.py index 
03901f712..3703f38cd 100644 --- a/DashAI/back/dataset_sources/base_dataset_source.py +++ b/DashAI/back/dataset_sources/base_dataset_source.py @@ -56,6 +56,7 @@ class BaseDatasetSource(ConfigObject, ABC): TYPE: Final[str] = "DatasetSource" DISPLAY_NAME: Final = MultilingualString(en="", es="") DESCRIPTION: Final = MultilingualString(en="", es="") + COMPATIBLE_COMPONENTS: Final[list[str]] = [] @abstractmethod def search(self, query: str, limit: int = 20, offset: int = 0, **filters: Any) -> list[DatasetEntry]: diff --git a/DashAI/back/dataset_sources/huggingface_dataset_source.py b/DashAI/back/dataset_sources/huggingface_dataset_source.py index 4731d96b7..27265467c 100644 --- a/DashAI/back/dataset_sources/huggingface_dataset_source.py +++ b/DashAI/back/dataset_sources/huggingface_dataset_source.py @@ -31,6 +31,7 @@ class HuggingFaceDatasetSource(BaseDatasetSource): en="Browse and import public datasets from HuggingFace Hub.", es="Navega e importa datasets públicos desde HuggingFace Hub.", ) + COMPATIBLE_COMPONENTS: Final = ["CSVDataLoader"] def search(self, query: str, limit: int = 20, offset: int = 0, **filters: Any) -> list[DatasetEntry]: """Return public HuggingFace datasets matching a query. diff --git a/DashAI/back/dataset_sources/openml_dataset_source.py b/DashAI/back/dataset_sources/openml_dataset_source.py index fe322b1ab..3c79694ab 100644 --- a/DashAI/back/dataset_sources/openml_dataset_source.py +++ b/DashAI/back/dataset_sources/openml_dataset_source.py @@ -80,6 +80,7 @@ class OpenMLDatasetSource(BaseDatasetSource): en="Browse and import public datasets from OpenML.", es="Navega e importa datasets públicos desde OpenML.", ) + COMPATIBLE_COMPONENTS: Final = ["CSVDataLoader"] def search(self, query: str, limit: int = 20, offset: int = 0, **filters: Any) -> list[DatasetEntry]: """Return active OpenML datasets matching a name query. 
diff --git a/DashAI/back/job/dataset_job.py b/DashAI/back/job/dataset_job.py index 47fbcbc70..531c598e9 100644 --- a/DashAI/back/job/dataset_job.py +++ b/DashAI/back/job/dataset_job.py @@ -191,7 +191,7 @@ def run( file_path_hub, dataloader_name, ) - hub_loader_params = {"separator": ","} if dataloader_name == "CSVDataLoader" else {} + hub_loader_params = params.get("dataloader_params", {}) new_dataset = dataloader.load_data( filepath_or_buffer=file_path_hub, temp_path=hub_temp, diff --git a/DashAI/front/src/api/hub.ts b/DashAI/front/src/api/hub.ts index 358dcd330..7c393c2ae 100644 --- a/DashAI/front/src/api/hub.ts +++ b/DashAI/front/src/api/hub.ts @@ -7,6 +7,14 @@ export interface DatasetSourceInfo { type: string; display_name: string; description: string; + compatible_components: string[]; +} + +export interface ComponentInfo { + name: string; + display_name: string; + description: string; + schema: Record; } export interface DatasetEntry { @@ -92,3 +100,10 @@ export const importHubDataset = async ( ); return response.data; }; + +export const getComponentInfo = async ( + componentName: string, +): Promise => { + const response = await api.get(`/v1/components/${componentName}/`); + return response.data; +}; diff --git a/DashAI/front/src/components/hub/DatasetDetail.jsx b/DashAI/front/src/components/hub/DatasetDetail.jsx index 1fec5edb4..0fbead29f 100644 --- a/DashAI/front/src/components/hub/DatasetDetail.jsx +++ b/DashAI/front/src/components/hub/DatasetDetail.jsx @@ -23,7 +23,7 @@ import ImportDatasetDialog from "./ImportDatasetDialog"; * @param {string|null} sourceName - Active DatasetSource class name. * @param {function} onImported - Called after a successful import. 
*/ -export default function DatasetDetail({ dataset, sourceName, onImported }) { +export default function DatasetDetail({ dataset, sourceName, compatibleComponents = [], onImported }) { const { t } = useTranslation(["hub"]); const theme = useTheme(); const [importOpen, setImportOpen] = useState(false); @@ -165,6 +165,7 @@ export default function DatasetDetail({ dataset, sourceName, onImported }) { onClose={() => setImportOpen(false)} sourceName={sourceName} dataset={dataset} + compatibleComponents={compatibleComponents} onImported={onImported} /> diff --git a/DashAI/front/src/components/hub/HubLeftBar.jsx b/DashAI/front/src/components/hub/HubLeftBar.jsx index ee7792675..25b2f9c47 100644 --- a/DashAI/front/src/components/hub/HubLeftBar.jsx +++ b/DashAI/front/src/components/hub/HubLeftBar.jsx @@ -64,7 +64,7 @@ export default function HubLeftBar({ selectedSource, onSelectSource }) { onSelectSource(source.name)} + onClick={() => onSelectSource(source)} sx={{ "&.Mui-selected": { bgcolor: "action.selected", diff --git a/DashAI/front/src/components/hub/ImportDatasetDialog.jsx b/DashAI/front/src/components/hub/ImportDatasetDialog.jsx index 2a9021087..c37fe8c85 100644 --- a/DashAI/front/src/components/hub/ImportDatasetDialog.jsx +++ b/DashAI/front/src/components/hub/ImportDatasetDialog.jsx @@ -1,4 +1,4 @@ -import { useCallback, useEffect, useState } from "react"; +import { useCallback, useEffect, useRef, useState } from "react"; import { Box, Button, @@ -7,33 +7,53 @@ import { DialogActions, DialogContent, DialogTitle, + FormControl, + FormControlLabel, + Radio, + RadioGroup, + Step, + StepLabel, + Stepper, TextField, Typography, } from "@mui/material"; import { useSnackbar } from "notistack"; import { useTranslation } from "react-i18next"; import { createDataset } from "../../api/datasets"; -import { importHubDataset, previewHubDataset } from "../../api/hub"; +import { getComponentInfo, importHubDataset, previewHubDataset } from "../../api/hub"; +import ParameterForm from 
"../configurableObject/ParameterForm"; import PreviewDataset from "../notebooks/datasetCreation/PreviewDataset"; +const STEPS = ["hub:stepFormat", "hub:stepParameters", "hub:stepPreview"]; + /** - * Dialog that previews a Hub dataset and imports it into DashAI on confirm. + * Dialog that walks the user through format selection, dataloader parameter + * configuration, dataset preview, and final import into DashAI. * * @param {boolean} open - Whether the dialog is open. * @param {function} onClose - Called when the dialog is dismissed. * @param {string} sourceName - DatasetSource class name. * @param {object|null} dataset - DatasetEntry to import. - * @param {function} onImported - Called with job_id after successful enqueue. + * @param {string[]} compatibleComponents - DataLoader class names compatible with this source. + * @param {function} onImported - Called after successful import. */ export default function ImportDatasetDialog({ open, onClose, sourceName, dataset, + compatibleComponents = [], onImported, }) { const { t } = useTranslation(["hub", "common"]); const { enqueueSnackbar } = useSnackbar(); + const paramsFormRef = useRef(null); + + const [step, setStep] = useState(0); + const [selectedLoader, setSelectedLoader] = useState(null); + const [loaderInfos, setLoaderInfos] = useState({}); + const [loadingInfos, setLoadingInfos] = useState(false); + const [loaderParams, setLoaderParams] = useState({}); const [name, setName] = useState(""); const [previewData, setPreviewData] = useState(null); @@ -46,12 +66,32 @@ export default function ImportDatasetDialog({ useEffect(() => { if (!open || !dataset || !sourceName) return; setName(dataset.name || ""); + setStep(0); + setSelectedLoader(compatibleComponents.length === 1 ? 
compatibleComponents[0] : null); + setLoaderParams({}); setPreviewData(null); - setPreviewLoading(true); setPreviewError(false); setColumnTypes({}); setColumnRenames({}); + if (!compatibleComponents.length) return; + setLoadingInfos(true); + Promise.all(compatibleComponents.map(getComponentInfo)) + .then((infos) => { + const map = {}; + infos.forEach((info) => { + map[info.name] = info; + }); + setLoaderInfos(map); + }) + .catch(() => {}) + .finally(() => setLoadingInfos(false)); + }, [open, dataset, sourceName, compatibleComponents]); + + const fetchPreview = useCallback(() => { + setPreviewData(null); + setPreviewLoading(true); + setPreviewError(false); previewHubDataset(sourceName, dataset.id, 100) .then((data) => { setPreviewData(data); @@ -59,18 +99,44 @@ export default function ImportDatasetDialog({ }) .catch(() => setPreviewError(true)) .finally(() => setPreviewLoading(false)); - }, [open, dataset, sourceName]); + }, [sourceName, dataset]); const handleColumnRename = useCallback((oldName, newName) => { setColumnRenames((prev) => ({ ...prev, [oldName]: newName })); }, []); + const handleParamsSubmit = useCallback( + (values) => { + setLoaderParams(values); + fetchPreview(); + setStep(2); + }, + [fetchPreview], + ); + + const handleNext = () => { + if (step === 0) { + setStep(1); + } else if (step === 1) { + if (paramsFormRef.current) { + paramsFormRef.current.handleSubmit(); + } else { + fetchPreview(); + setStep(2); + } + } + }; + + const handleBack = () => setStep((s) => Math.max(0, s - 1)); + const handleImport = async () => { if (!name.trim() || !dataset) return; setImporting(true); try { const created = await createDataset(name.trim()); await importHubDataset(sourceName, dataset.id, created.id, { + dataloader: selectedLoader, + dataloader_params: loaderParams, inferred_types: columnTypes, column_renames: columnRenames, }); @@ -84,48 +150,147 @@ export default function ImportDatasetDialog({ } }; + const selectedLoaderInfo = selectedLoader ? 
loaderInfos[selectedLoader] : null; + return ( {t("hub:importDataset")} - - setName(e.target.value)} - fullWidth - /> - - - {previewLoading && ( - - + + {STEPS.map((key) => ( + + {t(key)} + + ))} + + + {/* Step 0: Format selection */} + {step === 0 && ( + + {loadingInfos ? ( + + + + ) : ( + <> + + {t("hub:selectFormatDescription")} + + + setSelectedLoader(e.target.value)} + > + {compatibleComponents.map((loaderName) => { + const info = loaderInfos[loaderName]; + return ( + } + label={ + + + {info?.display_name || loaderName} + + {info?.description && ( + + {info.description} + + )} + + } + /> + ); + })} + + + + )} )} - {previewError && !previewLoading && ( - {t("hub:previewError")} + {/* Step 1: Dataloader parameters */} + {step === 1 && ( + + {!selectedLoaderInfo ? ( + + + + ) : selectedLoaderInfo.schema ? ( + + ) : ( + + {t("hub:noParameters")} + + )} + )} - {!previewLoading && !previewError && previewData && ( - + {/* Step 2: Preview and confirm */} + {step === 2 && ( + + + setName(e.target.value)} + fullWidth + /> + + + {previewLoading && ( + + + + )} + + {previewError && !previewLoading && ( + {t("hub:previewError")} + )} + + {!previewLoading && !previewError && previewData && ( + + )} + )} + - + {step > 0 && ( + + )} + {step < 2 ? ( + + ) : ( + + )} ); diff --git a/DashAI/front/src/pages/hub/HubContent.jsx b/DashAI/front/src/pages/hub/HubContent.jsx index 2bdc6db76..6516ddcf5 100644 --- a/DashAI/front/src/pages/hub/HubContent.jsx +++ b/DashAI/front/src/pages/hub/HubContent.jsx @@ -14,8 +14,10 @@ export default function HubContent() { const [selectedSource, setSelectedSource] = useState(null); const [selectedDataset, setSelectedDataset] = useState(null); - const handleSelectSource = (sourceName) => { - setSelectedSource(sourceName); + const sourceName = selectedSource?.name ?? 
null; + + const handleSelectSource = (source) => { + setSelectedSource(source); setSelectedDataset(null); }; @@ -28,14 +30,14 @@ export default function HubContent() { @@ -44,7 +46,8 @@ export default function HubContent() { diff --git a/DashAI/front/src/utils/i18n/locales/en/hub.json b/DashAI/front/src/utils/i18n/locales/en/hub.json index bc591198b..000fe580b 100644 --- a/DashAI/front/src/utils/i18n/locales/en/hub.json +++ b/DashAI/front/src/utils/i18n/locales/en/hub.json @@ -18,5 +18,10 @@ "importSuccess": "Dataset imported successfully.", "importError": "Failed to import dataset.", "previewError": "Failed to load dataset preview.", - "selectDatasetToPreview": "Select a dataset to see details." + "selectDatasetToPreview": "Select a dataset to see details.", + "stepFormat": "Format", + "stepParameters": "Parameters", + "stepPreview": "Preview", + "selectFormatDescription": "Select the file format that matches this dataset.", + "noParameters": "No additional parameters required." } diff --git a/DashAI/front/src/utils/i18n/locales/es/hub.json b/DashAI/front/src/utils/i18n/locales/es/hub.json index 3a51ee3c0..f22103454 100644 --- a/DashAI/front/src/utils/i18n/locales/es/hub.json +++ b/DashAI/front/src/utils/i18n/locales/es/hub.json @@ -18,5 +18,10 @@ "importSuccess": "Dataset importado exitosamente.", "importError": "Error al importar el dataset.", "previewError": "Error al cargar la vista previa.", - "selectDatasetToPreview": "Selecciona un dataset para ver los detalles." + "selectDatasetToPreview": "Selecciona un dataset para ver los detalles.", + "stepFormat": "Formato", + "stepParameters": "Parámetros", + "stepPreview": "Vista previa", + "selectFormatDescription": "Selecciona el formato del archivo que corresponde a este dataset.", + "noParameters": "No se requieren parámetros adicionales." 
} From dc32854654c75d5e6658b2f45352ccacc6d79d4f Mon Sep 17 00:00:00 2001 From: Irozuku Date: Wed, 6 May 2026 10:02:46 -0400 Subject: [PATCH 034/361] refactor: simplify DatasetDetail to fire onStartImport instead of managing dialog --- .../src/components/hub/DatasetDetail.jsx | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/DashAI/front/src/components/hub/DatasetDetail.jsx b/DashAI/front/src/components/hub/DatasetDetail.jsx index 0fbead29f..fd3e103d7 100644 --- a/DashAI/front/src/components/hub/DatasetDetail.jsx +++ b/DashAI/front/src/components/hub/DatasetDetail.jsx @@ -14,19 +14,17 @@ import AddIcon from "@mui/icons-material/Add"; import { useTheme } from "@mui/material/styles"; import { useTranslation } from "react-i18next"; import { getDatasetInfo, getDownloadUrl } from "../../api/hub"; -import ImportDatasetDialog from "./ImportDatasetDialog"; /** * Right panel — detailed view of a selected Hub dataset with action buttons. * * @param {object|null} dataset - Selected DatasetEntry, or null if none. * @param {string|null} sourceName - Active DatasetSource class name. - * @param {function} onImported - Called after a successful import. + * @param {function} onStartImport - Called when user clicks "Add to DashAI". 
*/ -export default function DatasetDetail({ dataset, sourceName, compatibleComponents = [], onImported }) { +export default function DatasetDetail({ dataset, sourceName, onStartImport }) { const { t } = useTranslation(["hub"]); const theme = useTheme(); - const [importOpen, setImportOpen] = useState(false); const [extraInfo, setExtraInfo] = useState(null); useEffect(() => { @@ -94,7 +92,7 @@ export default function DatasetDetail({ dataset, sourceName, compatibleComponent variant="contained" size="small" startIcon={} - onClick={() => setImportOpen(true)} + onClick={() => onStartImport?.()} > {t("hub:addToDashAI")} @@ -140,7 +138,7 @@ export default function DatasetDetail({ dataset, sourceName, compatibleComponent )} - {((extraInfo?.tags ?? dataset.tags)?.length > 0) && ( + {(extraInfo?.tags ?? dataset.tags)?.length > 0 && ( - - setImportOpen(false)} - sourceName={sourceName} - dataset={dataset} - compatibleComponents={compatibleComponents} - onImported={onImported} - /> ); } From a3c7b8161ba4c1eaeb89901a7d63dbc995dc52cc Mon Sep 17 00:00:00 2001 From: Irozuku Date: Wed, 6 May 2026 10:02:52 -0400 Subject: [PATCH 035/361] feat: replace ImportDatasetDialog with HubImportPanel two-step flow --- .../src/components/hub/HubImportPanel.jsx | 357 ++++++++++++++++++ .../components/hub/ImportDatasetDialog.jsx | 297 --------------- .../front/src/utils/i18n/locales/en/hub.json | 13 +- .../front/src/utils/i18n/locales/es/hub.json | 13 +- 4 files changed, 381 insertions(+), 299 deletions(-) create mode 100644 DashAI/front/src/components/hub/HubImportPanel.jsx delete mode 100644 DashAI/front/src/components/hub/ImportDatasetDialog.jsx diff --git a/DashAI/front/src/components/hub/HubImportPanel.jsx b/DashAI/front/src/components/hub/HubImportPanel.jsx new file mode 100644 index 000000000..97e1e7e2b --- /dev/null +++ b/DashAI/front/src/components/hub/HubImportPanel.jsx @@ -0,0 +1,357 @@ +import { useCallback, useEffect, useRef, useState } from "react"; +import { + Box, + Button, + 
Breadcrumbs, + CircularProgress, + IconButton, + Link, + TextField, + Typography, +} from "@mui/material"; +import ArrowBackIcon from "@mui/icons-material/ArrowBack"; +import { useSnackbar } from "notistack"; +import { useTranslation } from "react-i18next"; +import { useNavigate } from "react-router-dom"; +import { createDataset } from "../../api/datasets"; +import { + getComponentInfo, + importHubDataset, + previewHubDataset, +} from "../../api/hub"; +import ComponentSelector from "../custom/ComponentSelector"; +import PreviewDataset from "../notebooks/datasetCreation/PreviewDataset"; + +export default function HubImportPanel({ + dataset, + sourceName, + compatibleComponents = [], + step, + onStepChange, + selectedLoader, + onSelectedLoaderChange, + formValues = {}, + formHasErrors = false, + onCancel, + onImported, +}) { + const { t } = useTranslation(["hub", "common", "datasets"]); + const { enqueueSnackbar } = useSnackbar(); + const navigate = useNavigate(); + const [localStep, setLocalStep] = useState(0); + const [localSelectedLoader, setLocalSelectedLoader] = useState(null); + const stepValue = step ?? localStep; + const setStepValue = onStepChange ?? setLocalStep; + const selectedValue = selectedLoader ?? 
localSelectedLoader; + + const [dataloaders, setDataloaders] = useState([]); + const [loadingDataloaders, setLoadingDataloaders] = useState(false); + + const [name, setName] = useState(""); + const [previewData, setPreviewData] = useState(null); + const [previewLoading, setPreviewLoading] = useState(false); + const [previewError, setPreviewError] = useState(false); + const [columnTypes, setColumnTypes] = useState({}); + const [columnRenames, setColumnRenames] = useState({}); + const [importing, setImporting] = useState(false); + const previewDebounceRef = useRef(null); + + useEffect(() => { + if (!dataset || !sourceName) return; + setStepValue(0); + setName(dataset.name || ""); + setLocalSelectedLoader(null); + onSelectedLoaderChange?.(null); + setDataloaders([]); + setPreviewData(null); + setPreviewError(false); + setColumnTypes({}); + setColumnRenames({}); + }, [dataset?.id, sourceName, onSelectedLoaderChange, setStepValue]); + + useEffect(() => { + if (!compatibleComponents.length) { + setDataloaders([]); + setLocalSelectedLoader(null); + onSelectedLoaderChange?.(null); + return; + } + + let isMounted = true; + setLoadingDataloaders(true); + Promise.all(compatibleComponents.map(getComponentInfo)) + .then((infos) => { + if (!isMounted) return; + setDataloaders(infos); + if (infos.length === 1) { + setLocalSelectedLoader(infos[0]); + onSelectedLoaderChange?.(infos[0]); + } + }) + .catch(() => { + if (isMounted) setDataloaders([]); + }) + .finally(() => { + if (isMounted) setLoadingDataloaders(false); + }); + + return () => { + isMounted = false; + }; + }, [compatibleComponents]); + + useEffect(() => { + if (stepValue !== 1 || !dataset || !sourceName) return; + + let isMounted = true; + if (previewDebounceRef.current) { + clearTimeout(previewDebounceRef.current); + } + + setPreviewData(null); + setPreviewLoading(true); + setPreviewError(false); + + const rows = Number(formValues?.inference_rows); + const effectiveRows = Number.isFinite(rows) + ? 
Math.min(Math.max(2, rows), 500) + : 100; + + previewDebounceRef.current = setTimeout(() => { + previewHubDataset(sourceName, dataset.id, effectiveRows) + .then((data) => { + if (!isMounted) return; + setPreviewData(data); + setColumnTypes(data.inferred_types || {}); + }) + .catch(() => { + if (isMounted) setPreviewError(true); + }) + .finally(() => { + if (isMounted) setPreviewLoading(false); + }); + }, 350); + + return () => { + isMounted = false; + if (previewDebounceRef.current) { + clearTimeout(previewDebounceRef.current); + } + }; + }, [ + stepValue, + dataset?.id, + sourceName, + selectedValue?.name, + JSON.stringify(formValues || {}), + ]); + + const handleColumnRename = useCallback((oldName, newName) => { + setColumnRenames((prev) => ({ ...prev, [oldName]: newName })); + }, []); + + const handleImport = async () => { + if (!name.trim() || !dataset || !selectedValue || formHasErrors) return; + setImporting(true); + try { + const created = await createDataset(name.trim()); + await importHubDataset(sourceName, dataset.id, created.id, { + dataloader: selectedValue.name, + dataloader_params: formValues, + inferred_types: columnTypes, + column_renames: columnRenames, + }); + enqueueSnackbar(t("hub:importSuccess"), { variant: "success" }); + onImported?.(); + } catch { + enqueueSnackbar(t("hub:importError"), { variant: "error" }); + } finally { + setImporting(false); + } + }; + + const canProceed = !!selectedValue?.name; + const canImport = + !!selectedValue?.name && + !!name.trim() && + !previewLoading && + !previewError && + !!previewData && + !formHasErrors && + !importing; + + return ( + + + + navigate("/app/data")} + size="small" + sx={{ + color: "text.secondary", + "&:hover": { + color: "primary.main", + backgroundColor: "action.hover", + }, + }} + aria-label={t("common:back")} + > + + + + { + e.preventDefault(); + navigate("/app/data"); + }} + > + {t("common:datasets")} + + { + e.preventDefault(); + navigate("/app/hub"); + }} + > + {t("common:hub")} + + + 
{t("hub:importDataset")} + + + + + + + {stepValue === 0 && ( + + + + {t("hub:stepDataloaderTitle")} + + + {t("hub:stepDataloaderSubtitle")} + + + + {loadingDataloaders ? ( + + + + ) : compatibleComponents.length === 0 ? ( + + {t("hub:noCompatibleDataloaders")} + + ) : ( + + { + setLocalSelectedLoader(item); + onSelectedLoaderChange?.(item); + }} + searchPlaceholder={t("datasets:searchDataloaders", { + defaultValue: "Search data loaders...", + })} + /> + + )} + + )} + + {stepValue === 1 && ( + + + + {t("hub:stepPreviewTitle")} + + + {t("hub:stepPreviewSubtitle")} + + + setName(e.target.value)} + fullWidth + /> + + {previewLoading && ( + + + + )} + + {previewError && !previewLoading && ( + {t("hub:previewError")} + )} + + {!previewLoading && !previewError && previewData && ( + + )} + + )} + + + + {stepValue === 0 ? ( + + ) : ( + + )} + + {stepValue === 0 ? ( + + ) : ( + + )} + + + ); +} diff --git a/DashAI/front/src/components/hub/ImportDatasetDialog.jsx b/DashAI/front/src/components/hub/ImportDatasetDialog.jsx deleted file mode 100644 index c37fe8c85..000000000 --- a/DashAI/front/src/components/hub/ImportDatasetDialog.jsx +++ /dev/null @@ -1,297 +0,0 @@ -import { useCallback, useEffect, useRef, useState } from "react"; -import { - Box, - Button, - CircularProgress, - Dialog, - DialogActions, - DialogContent, - DialogTitle, - FormControl, - FormControlLabel, - Radio, - RadioGroup, - Step, - StepLabel, - Stepper, - TextField, - Typography, -} from "@mui/material"; -import { useSnackbar } from "notistack"; -import { useTranslation } from "react-i18next"; -import { createDataset } from "../../api/datasets"; -import { getComponentInfo, importHubDataset, previewHubDataset } from "../../api/hub"; -import ParameterForm from "../configurableObject/ParameterForm"; -import PreviewDataset from "../notebooks/datasetCreation/PreviewDataset"; - -const STEPS = ["hub:stepFormat", "hub:stepParameters", "hub:stepPreview"]; - -/** - * Dialog that walks the user through format 
selection, dataloader parameter - * configuration, dataset preview, and final import into DashAI. - * - * @param {boolean} open - Whether the dialog is open. - * @param {function} onClose - Called when the dialog is dismissed. - * @param {string} sourceName - DatasetSource class name. - * @param {object|null} dataset - DatasetEntry to import. - * @param {string[]} compatibleComponents - DataLoader class names compatible with this source. - * @param {function} onImported - Called after successful import. - */ -export default function ImportDatasetDialog({ - open, - onClose, - sourceName, - dataset, - compatibleComponents = [], - onImported, -}) { - const { t } = useTranslation(["hub", "common"]); - const { enqueueSnackbar } = useSnackbar(); - const paramsFormRef = useRef(null); - - const [step, setStep] = useState(0); - const [selectedLoader, setSelectedLoader] = useState(null); - const [loaderInfos, setLoaderInfos] = useState({}); - const [loadingInfos, setLoadingInfos] = useState(false); - const [loaderParams, setLoaderParams] = useState({}); - - const [name, setName] = useState(""); - const [previewData, setPreviewData] = useState(null); - const [previewLoading, setPreviewLoading] = useState(false); - const [previewError, setPreviewError] = useState(false); - const [columnTypes, setColumnTypes] = useState({}); - const [columnRenames, setColumnRenames] = useState({}); - const [importing, setImporting] = useState(false); - - useEffect(() => { - if (!open || !dataset || !sourceName) return; - setName(dataset.name || ""); - setStep(0); - setSelectedLoader(compatibleComponents.length === 1 ? 
compatibleComponents[0] : null); - setLoaderParams({}); - setPreviewData(null); - setPreviewError(false); - setColumnTypes({}); - setColumnRenames({}); - - if (!compatibleComponents.length) return; - setLoadingInfos(true); - Promise.all(compatibleComponents.map(getComponentInfo)) - .then((infos) => { - const map = {}; - infos.forEach((info) => { - map[info.name] = info; - }); - setLoaderInfos(map); - }) - .catch(() => {}) - .finally(() => setLoadingInfos(false)); - }, [open, dataset, sourceName, compatibleComponents]); - - const fetchPreview = useCallback(() => { - setPreviewData(null); - setPreviewLoading(true); - setPreviewError(false); - previewHubDataset(sourceName, dataset.id, 100) - .then((data) => { - setPreviewData(data); - setColumnTypes(data.inferred_types || {}); - }) - .catch(() => setPreviewError(true)) - .finally(() => setPreviewLoading(false)); - }, [sourceName, dataset]); - - const handleColumnRename = useCallback((oldName, newName) => { - setColumnRenames((prev) => ({ ...prev, [oldName]: newName })); - }, []); - - const handleParamsSubmit = useCallback( - (values) => { - setLoaderParams(values); - fetchPreview(); - setStep(2); - }, - [fetchPreview], - ); - - const handleNext = () => { - if (step === 0) { - setStep(1); - } else if (step === 1) { - if (paramsFormRef.current) { - paramsFormRef.current.handleSubmit(); - } else { - fetchPreview(); - setStep(2); - } - } - }; - - const handleBack = () => setStep((s) => Math.max(0, s - 1)); - - const handleImport = async () => { - if (!name.trim() || !dataset) return; - setImporting(true); - try { - const created = await createDataset(name.trim()); - await importHubDataset(sourceName, dataset.id, created.id, { - dataloader: selectedLoader, - dataloader_params: loaderParams, - inferred_types: columnTypes, - column_renames: columnRenames, - }); - enqueueSnackbar(t("hub:importSuccess"), { variant: "success" }); - onImported?.(); - onClose(); - } catch { - enqueueSnackbar(t("hub:importError"), { variant: 
"error" }); - } finally { - setImporting(false); - } - }; - - const selectedLoaderInfo = selectedLoader ? loaderInfos[selectedLoader] : null; - - return ( - - {t("hub:importDataset")} - - - {STEPS.map((key) => ( - - {t(key)} - - ))} - - - {/* Step 0: Format selection */} - {step === 0 && ( - - {loadingInfos ? ( - - - - ) : ( - <> - - {t("hub:selectFormatDescription")} - - - setSelectedLoader(e.target.value)} - > - {compatibleComponents.map((loaderName) => { - const info = loaderInfos[loaderName]; - return ( - } - label={ - - - {info?.display_name || loaderName} - - {info?.description && ( - - {info.description} - - )} - - } - /> - ); - })} - - - - )} - - )} - - {/* Step 1: Dataloader parameters */} - {step === 1 && ( - - {!selectedLoaderInfo ? ( - - - - ) : selectedLoaderInfo.schema ? ( - - ) : ( - - {t("hub:noParameters")} - - )} - - )} - - {/* Step 2: Preview and confirm */} - {step === 2 && ( - - - setName(e.target.value)} - fullWidth - /> - - - {previewLoading && ( - - - - )} - - {previewError && !previewLoading && ( - {t("hub:previewError")} - )} - - {!previewLoading && !previewError && previewData && ( - - )} - - )} - - - - - {step > 0 && ( - - )} - {step < 2 ? ( - - ) : ( - - )} - - - ); -} diff --git a/DashAI/front/src/utils/i18n/locales/en/hub.json b/DashAI/front/src/utils/i18n/locales/en/hub.json index 000fe580b..fedef0cd7 100644 --- a/DashAI/front/src/utils/i18n/locales/en/hub.json +++ b/DashAI/front/src/utils/i18n/locales/en/hub.json @@ -22,6 +22,17 @@ "stepFormat": "Format", "stepParameters": "Parameters", "stepPreview": "Preview", + "stepDataloader": "DataLoader", + "stepPreviewParameters": "Preview & Parameters", "selectFormatDescription": "Select the file format that matches this dataset.", - "noParameters": "No additional parameters required." 
+ "selectDataloaderDescription": "Select the DataLoader to use for this dataset.", + "noParameters": "No additional parameters required.", + "noCompatibleDataloaders": "No compatible DataLoaders available for this source.", + "stepDataloaderTitle": "Select DataLoader", + "stepDataloaderSubtitle": "Choose how this dataset will be loaded.", + "stepPreviewTitle": "Preview and configure", + "stepPreviewSubtitle": "Review the preview and adjust parameters in the right panel.", + "previewParameters": "Preview and parameters", + "previewRows": "Preview rows", + "previewRowsDescription": "Number of rows to sample for preview." } diff --git a/DashAI/front/src/utils/i18n/locales/es/hub.json b/DashAI/front/src/utils/i18n/locales/es/hub.json index f22103454..c71cdb5c0 100644 --- a/DashAI/front/src/utils/i18n/locales/es/hub.json +++ b/DashAI/front/src/utils/i18n/locales/es/hub.json @@ -22,6 +22,17 @@ "stepFormat": "Formato", "stepParameters": "Parámetros", "stepPreview": "Vista previa", + "stepDataloader": "DataLoader", + "stepPreviewParameters": "Vista previa y parámetros", "selectFormatDescription": "Selecciona el formato del archivo que corresponde a este dataset.", - "noParameters": "No se requieren parámetros adicionales." + "selectDataloaderDescription": "Selecciona el DataLoader para este dataset.", + "noParameters": "No se requieren parámetros adicionales.", + "noCompatibleDataloaders": "No hay DataLoaders compatibles disponibles para esta fuente.", + "stepDataloaderTitle": "Seleccionar DataLoader", + "stepDataloaderSubtitle": "Elige cómo se cargará este dataset.", + "stepPreviewTitle": "Vista previa y configuración", + "stepPreviewSubtitle": "Revisa la vista previa y ajusta los parámetros en el panel derecho.", + "previewParameters": "Vista previa y parámetros", + "previewRows": "Filas de vista previa", + "previewRowsDescription": "Número de filas para muestrear en la vista previa." 
} From eb7f88d15f75f7bd1ea7344d85a366e2221c52f5 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Wed, 6 May 2026 10:02:57 -0400 Subject: [PATCH 036/361] feat: wire HubImportPanel into HubContent with DataloaderConfigBar right panel --- DashAI/front/src/pages/hub/HubContent.jsx | 87 +++++++++++++++++++---- 1 file changed, 75 insertions(+), 12 deletions(-) diff --git a/DashAI/front/src/pages/hub/HubContent.jsx b/DashAI/front/src/pages/hub/HubContent.jsx index 6516ddcf5..e819a2b25 100644 --- a/DashAI/front/src/pages/hub/HubContent.jsx +++ b/DashAI/front/src/pages/hub/HubContent.jsx @@ -1,4 +1,4 @@ -import { useState } from "react"; +import { useRef, useState } from "react"; import ModuleContainer from "../../components/layout/ModuleContainer"; import LeftPanel from "../../components/threeSectionLayout/panels/LeftPanel"; import CenterPanel from "../../components/threeSectionLayout/panels/CenterPanel"; @@ -8,21 +8,56 @@ import { useThreePanelLayout } from "../../hooks/useThreePanelsLayout"; import HubLeftBar from "../../components/hub/HubLeftBar"; import DatasetGrid from "../../components/hub/DatasetGrid"; import DatasetDetail from "../../components/hub/DatasetDetail"; +import HubImportPanel from "../../components/hub/HubImportPanel"; +import ComponentDetailsPanel from "../../components/custom/ComponentDetailsPanel"; +import DataloaderConfigBar from "../../components/notebooks/datasetCreation/DataloaderConfigBar"; export default function HubContent() { const threePanelLayout = useThreePanelLayout({ storageKey: "hub" }); const [selectedSource, setSelectedSource] = useState(null); const [selectedDataset, setSelectedDataset] = useState(null); + const [importMode, setImportMode] = useState(false); + const [importStep, setImportStep] = useState(0); + const [selectedDataloader, setSelectedDataloader] = useState(null); + const [formValues, setFormValues] = useState({}); + const [formHasErrors, setFormHasErrors] = useState(false); + const formSubmitRef = useRef(null); const 
sourceName = selectedSource?.name ?? null; const handleSelectSource = (source) => { setSelectedSource(source); setSelectedDataset(null); + setImportMode(false); + setImportStep(0); + setSelectedDataloader(null); + setFormValues({}); + setFormHasErrors(false); }; const handleImported = () => { setSelectedDataset(null); + setImportMode(false); + setImportStep(0); + setSelectedDataloader(null); + setFormValues({}); + setFormHasErrors(false); + }; + + const handleStartImport = () => { + setImportMode(true); + setImportStep(0); + setSelectedDataloader(null); + setFormValues({}); + setFormHasErrors(false); + }; + + const handleExitImport = () => { + setImportMode(false); + setImportStep(0); + setSelectedDataloader(null); + setFormValues({}); + setFormHasErrors(false); }; return ( @@ -36,20 +71,48 @@ export default function HubContent() { - + {importMode ? ( + + ) : ( + + )} - + {importMode ? ( + importStep === 0 ? ( + + ) : ( + + ) + ) : ( + + )} From 09b1983d4df03fb88273707ad56ae926161a01e0 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Wed, 6 May 2026 10:04:47 -0400 Subject: [PATCH 037/361] feat: add POST /preview endpoint that uses dataloader params for accurate preview --- .../api/api_v1/endpoints/dataset_source.py | 106 +++++++++++++++++- 1 file changed, 105 insertions(+), 1 deletion(-) diff --git a/DashAI/back/api/api_v1/endpoints/dataset_source.py b/DashAI/back/api/api_v1/endpoints/dataset_source.py index ea860149c..9c1593d79 100644 --- a/DashAI/back/api/api_v1/endpoints/dataset_source.py +++ b/DashAI/back/api/api_v1/endpoints/dataset_source.py @@ -246,6 +246,108 @@ async def preview_dataset( } +class PreviewRequest(BaseModel): + """Request body for previewing a dataset with dataloader params. + + Parameters + ---------- + dataloader : str | None + Name of the DataLoader to use for parsing the file. + params : dict + DataLoader parameters (e.g., separator for CSV). + n_rows : int + Number of rows to sample (1-500). 
+ """ + + dataloader: str | None = None + params: Dict[str, Any] = {} + n_rows: int = 100 + + +@router.post("/{source_name}/{dataset_id:path}/preview") +async def preview_dataset_with_params( + source_name: str, + dataset_id: str, + body: PreviewRequest, + registry: "ComponentRegistry" = Depends(lambda: di["component_registry"]), +) -> Dict[str, Any]: + """Fetch a sample preview using a DataLoader and params. + + Parameters + ---------- + source_name : str + Registered DatasetSource class name. + dataset_id : str + Source-specific dataset identifier (URL-encoded). + body : PreviewRequest + DataLoader name, params, and row count. + registry : ComponentRegistry + Injected component registry. + + Returns + ------- + dict + ``{"sample": [...], "inferred_types": {...}, "preview_row_count": int}``. + """ + import tempfile + + source = _get_source(source_name, registry) + decoded_id = unquote(dataset_id) + n_rows = max(1, min(body.n_rows, 500)) + + try: + with tempfile.TemporaryDirectory() as temp_dir: + file_path, default_dataloader = source.fetch_full(decoded_id, temp_dir) + dataloader_name = body.dataloader or default_dataloader + dl_registry = registry._registry.get("DataLoader", {}) + if dataloader_name not in dl_registry: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"DataLoader '{dataloader_name}' not found.", + ) + + dataloader = dl_registry[dataloader_name]["class"]() + params = body.params or {} + + if hasattr(dataloader, "load_preview"): + df = dataloader.load_preview( + filepath_or_buffer=file_path, + params=params, + n_rows=n_rows, + ) + else: + dataset = dataloader.load_data( + filepath_or_buffer=file_path, + temp_path=temp_dir, + params=params, + n_sample=n_rows, + ) + df = dataset.to_pandas().head(n_rows) + except HTTPException: + raise + except Exception as exc: + log.exception("Error fetching preview for %s/%s", source_name, decoded_id) + raise HTTPException( + status_code=status.HTTP_502_BAD_GATEWAY, + detail=f"Failed to 
fetch preview from source: {exc}", + ) from exc + + if df.empty: + raise HTTPException( + status_code=status.HTTP_502_BAD_GATEWAY, + detail=f"Source returned no data for dataset '{decoded_id}'.", + ) + + inferred = infer_types(df, method="DashAIPtype") + sample = df.to_dict(orient="records") + + return { + "sample": sample, + "inferred_types": inferred, + "preview_row_count": len(df), + } + + class ImportRequest(BaseModel): """Request body for the dataset import endpoint. @@ -261,7 +363,9 @@ class ImportRequest(BaseModel): params: Dict[str, Any] = {} -@router.post("/{source_name}/{dataset_id:path}/import", status_code=status.HTTP_201_CREATED) +@router.post( + "/{source_name}/{dataset_id:path}/import", status_code=status.HTTP_201_CREATED +) async def import_dataset( source_name: str, dataset_id: str, From f0edd26e21941347e5ea2b1c7e51337caef92d9b Mon Sep 17 00:00:00 2001 From: Irozuku Date: Wed, 6 May 2026 10:04:52 -0400 Subject: [PATCH 038/361] fix: validate selected dataloader against COMPATIBLE_COMPONENTS in DatasetJob --- DashAI/back/job/dataset_job.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/DashAI/back/job/dataset_job.py b/DashAI/back/job/dataset_job.py index 531c598e9..8b6855155 100644 --- a/DashAI/back/job/dataset_job.py +++ b/DashAI/back/job/dataset_job.py @@ -167,6 +167,7 @@ def run( if source_name: # --- Hub import path --- import tempfile + hub_temp = tempfile.mkdtemp() temp_dir = hub_temp # ensures finally block cleans it up @@ -177,19 +178,31 @@ def run( f"DatasetSource '{source_name}' not found in registry." 
) source = sources[source_name]["class"]() - file_path_hub, dataloader_name = source.fetch_full( + file_path_hub, source_dataloader_name = source.fetch_full( dataset_source_id, hub_temp ) + selected_dataloader = ( + params.get("dataloader") or source_dataloader_name + ) + compatible = getattr(source, "COMPATIBLE_COMPONENTS", []) + if ( + params.get("dataloader") + and compatible + and selected_dataloader not in compatible + ): + raise JobError( + "Selected DataLoader is not compatible with this source." + ) dl_registry = component_registry._registry.get("DataLoader", {}) - if dataloader_name not in dl_registry: + if selected_dataloader not in dl_registry: raise JobError( - f"DataLoader '{dataloader_name}' not found in registry." + f"DataLoader '{selected_dataloader}' not found in registry." ) - dataloader = dl_registry[dataloader_name]["class"]() + dataloader = dl_registry[selected_dataloader]["class"]() log.debug( "Loading hub dataset from %s using %s", file_path_hub, - dataloader_name, + selected_dataloader, ) hub_loader_params = params.get("dataloader_params", {}) new_dataset = dataloader.load_data( @@ -201,7 +214,9 @@ def run( else: # --- File / URL upload path (unchanged) --- parsed_params = parse_params(DatasetParams, json.dumps(params)) - dataloader = component_registry[parsed_params.dataloader]["class"]() + dataloader = component_registry[parsed_params.dataloader][ + "class" + ]() log.debug("Storing dataset in %s", folder_path) new_dataset = dataloader.load_data( filepath_or_buffer=( From c53650cb562b977db9b4bb88d650553586251d4c Mon Sep 17 00:00:00 2001 From: Irozuku Date: Wed, 6 May 2026 10:04:57 -0400 Subject: [PATCH 039/361] fix: use POST preview when dataloader/params given, fix component URL to singular --- DashAI/front/src/api/hub.ts | 28 +++++++++++++------ .../src/components/hub/HubImportPanel.jsx | 2 +- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/DashAI/front/src/api/hub.ts b/DashAI/front/src/api/hub.ts index 7c393c2ae..fabf674a9 
100644 --- a/DashAI/front/src/api/hub.ts +++ b/DashAI/front/src/api/hub.ts @@ -57,9 +57,11 @@ export const getDatasetInfo = async ( datasetId: string, ): Promise<{ id?: string; description?: string; tags?: string[] }> => { const encodedId = encodeURIComponent(datasetId); - const response = await api.get<{ id?: string; description?: string; tags?: string[] }>( - `${hubEndpoint}/${sourceName}/${encodedId}/info`, - ); + const response = await api.get<{ + id?: string; + description?: string; + tags?: string[]; + }>(`${hubEndpoint}/${sourceName}/${encodedId}/info`); return response.data; }; @@ -78,12 +80,20 @@ export const previewHubDataset = async ( sourceName: string, datasetId: string, nRows = 100, + dataloader?: string, + params?: Record, ): Promise => { const encodedId = encodeURIComponent(datasetId); - const response = await api.get( - `${hubEndpoint}/${sourceName}/${encodedId}/preview`, - { params: { n_rows: nRows } }, - ); + const response = + dataloader || params + ? await api.post( + `${hubEndpoint}/${sourceName}/${encodedId}/preview`, + { dataloader, params: params ?? 
{}, n_rows: nRows }, + ) + : await api.get( + `${hubEndpoint}/${sourceName}/${encodedId}/preview`, + { params: { n_rows: nRows } }, + ); return response.data; }; @@ -104,6 +114,8 @@ export const importHubDataset = async ( export const getComponentInfo = async ( componentName: string, ): Promise => { - const response = await api.get(`/v1/components/${componentName}/`); + const response = await api.get( + `/v1/component/${componentName}/`, + ); return response.data; }; diff --git a/DashAI/front/src/components/hub/HubImportPanel.jsx b/DashAI/front/src/components/hub/HubImportPanel.jsx index 97e1e7e2b..eaf9a487c 100644 --- a/DashAI/front/src/components/hub/HubImportPanel.jsx +++ b/DashAI/front/src/components/hub/HubImportPanel.jsx @@ -118,7 +118,7 @@ export default function HubImportPanel({ : 100; previewDebounceRef.current = setTimeout(() => { - previewHubDataset(sourceName, dataset.id, effectiveRows) + previewHubDataset(sourceName, dataset.id, effectiveRows, selectedValue?.name, formValues) .then((data) => { if (!isMounted) return; setPreviewData(data); From ef81119b86d800e7dda4d4834431b75bd79a0df3 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Wed, 6 May 2026 15:02:36 -0400 Subject: [PATCH 040/361] fix: decouple preview endpoints from hardcoded dataloader detection Replace extension-sniffing if/elif chains in preview_with_types and validate_type_changes with dataloader-name-from-params lookup and SUPPORTED_EXTENSIONS class metadata. Each dataloader declares its own accepted extensions including .zip; the frontend reads metadata.supported_extensions instead of matching on the dataloader name string. Add _clean_value helper to safely handle non-scalar cell values (arrays, inf, nan) in pandas 2.x without crashing replace(). 
--- DashAI/back/api/api_v1/endpoints/datasets.py | 101 +++++------------- .../dataloaders/classes/csv_dataloader.py | 1 + DashAI/back/dataloaders/classes/dataloader.py | 2 + .../dataloaders/classes/excel_dataloader.py | 1 + .../dataloaders/classes/json_dataloader.py | 1 + .../notebooks/datasetCreation/Upload.jsx | 28 ++--- 6 files changed, 46 insertions(+), 88 deletions(-) diff --git a/DashAI/back/api/api_v1/endpoints/datasets.py b/DashAI/back/api/api_v1/endpoints/datasets.py index 5ea732ac6..50e31a6d8 100644 --- a/DashAI/back/api/api_v1/endpoints/datasets.py +++ b/DashAI/back/api/api_v1/endpoints/datasets.py @@ -1632,61 +1632,48 @@ async def preview_with_types( try: inference_rows = parsed_params.get("inference_rows", 1000) + dataloader_name = parsed_params.get("dataloader_name") + + if not dataloader_name: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="dataloader_name is required in params.", + ) + + if dataloader_name not in component_registry: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Dataloader {dataloader_name} not found in registry.", + ) + + dataloader_cls = component_registry[dataloader_name]["class"] + dataloader = dataloader_cls() + if file.filename.endswith(".zip"): + allowed_exts = dataloader_cls.SUPPORTED_EXTENSIONS extract_dir = tempfile.mkdtemp() try: with zipfile.ZipFile(tmp_file_path, "r") as zf: zf.extractall(extract_dir) - supported_map = { - ".csv": "CSVDataLoader", - ".json": "JSONDataLoader", - ".xlsx": "ExcelDataLoader", - ".xls": "ExcelDataLoader", - } - dataloader_name = None matched_file = None for root, _, files in os.walk(extract_dir): for f in files: - ext = os.path.splitext(f)[1].lower() - if ext in supported_map: - dataloader_name = supported_map[ext] + if os.path.splitext(f)[1].lower() in allowed_exts: matched_file = os.path.join(root, f) break - if dataloader_name: + if matched_file: break - if dataloader_name is None: + if matched_file is None: raise 
HTTPException( status_code=status.HTTP_400_BAD_REQUEST, detail=( - "ZIP file does not contain any supported dataset files." - "Supported inner files: .csv, .json, .xlsx, .xls" - ), - ) - - if dataloader_name not in component_registry: - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=( - f"Dataloader {dataloader_name} not found in registry." + f"ZIP does not contain any file supported by " + f"{dataloader_name}." ), ) - dataloader = component_registry[dataloader_name]["class"]() - - if ( - dataloader_name == "CSVDataLoader" - and "separator" not in parsed_params - ): - parsed_params["separator"] = "," - if ( - dataloader_name == "JSONDataLoader" - and "data_key" not in parsed_params - ): - parsed_params["data_key"] = None - - # load_preview using the matched inner file path loaded_dataset = dataloader.load_preview( filepath_or_buffer=matched_file, params=parsed_params, @@ -1694,40 +1681,10 @@ async def preview_with_types( ) finally: - # cleanup extracted dir with contextlib.suppress(Exception): shutil.rmtree(extract_dir, ignore_errors=True) else: - if file.filename.endswith(".csv"): - dataloader_name = "CSVDataLoader" - if "separator" not in parsed_params: - parsed_params["separator"] = "," - - elif file.filename.endswith(".xlsx") or file.filename.endswith(".xls"): - dataloader_name = "ExcelDataLoader" - - elif file.filename.endswith(".json"): - dataloader_name = "JSONDataLoader" - if "data_key" not in parsed_params: - parsed_params["data_key"] = None - else: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail=( - "Unsupported file type. Only CSV, Excel and JSON files are " - "supported." 
- ), - ) - - if dataloader_name not in component_registry: - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"Dataloader {dataloader_name} not found in registry.", - ) - - dataloader = component_registry[dataloader_name]["class"]() - loaded_dataset = dataloader.load_preview( filepath_or_buffer=tmp_file_path, params=parsed_params, @@ -1797,16 +1754,12 @@ async def validate_type_changes( tmp_file_path = tmp_file.name try: - if file.filename.endswith(".csv"): - dataloader_name = "CSVDataLoader" - elif file.filename.endswith((".xlsx", ".xls")): - dataloader_name = "ExcelDataLoader" - elif file.filename.endswith(".json"): - dataloader_name = "JSONDataLoader" - else: + dataloader_name = parsed_params.get("dataloader_name") + + if not dataloader_name: raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, - detail="Unsupported file type", + detail="dataloader_name is required in params.", ) if dataloader_name not in component_registry: diff --git a/DashAI/back/dataloaders/classes/csv_dataloader.py b/DashAI/back/dataloaders/classes/csv_dataloader.py index 6d48df790..ae6553e72 100644 --- a/DashAI/back/dataloaders/classes/csv_dataloader.py +++ b/DashAI/back/dataloaders/classes/csv_dataloader.py @@ -216,6 +216,7 @@ class CSVDataLoader(BaseDataLoader): COMPATIBLE_COMPONENTS = ["TabularClassificationTask"] SCHEMA = CSVDataloaderSchema + SUPPORTED_EXTENSIONS: frozenset[str] = frozenset({".csv", ".zip"}) DESCRIPTION: str = MultilingualString( en=( diff --git a/DashAI/back/dataloaders/classes/dataloader.py b/DashAI/back/dataloaders/classes/dataloader.py index 467c04e99..a315d03ca 100644 --- a/DashAI/back/dataloaders/classes/dataloader.py +++ b/DashAI/back/dataloaders/classes/dataloader.py @@ -23,11 +23,13 @@ class BaseDataLoader(ConfigObject): en="File Uploading", es="Carga de Archivos", ) + SUPPORTED_EXTENSIONS: frozenset[str] = frozenset() @classmethod def get_metadata(cls) -> Dict[str, Any]: return { "category": cls.CATEGORY if 
cls.CATEGORY else "File Uploading", + "supported_extensions": sorted(cls.SUPPORTED_EXTENSIONS), } @abstractmethod diff --git a/DashAI/back/dataloaders/classes/excel_dataloader.py b/DashAI/back/dataloaders/classes/excel_dataloader.py index 183ea1a29..2b056cc0c 100644 --- a/DashAI/back/dataloaders/classes/excel_dataloader.py +++ b/DashAI/back/dataloaders/classes/excel_dataloader.py @@ -194,6 +194,7 @@ class ExcelDataLoader(BaseDataLoader): Handles multi-file uploads by concatenating all workbooks before splitting. """ + SUPPORTED_EXTENSIONS: frozenset[str] = frozenset({".xlsx", ".xls", ".zip"}) COMPATIBLE_COMPONENTS = ["TabularClassificationTask"] SCHEMA = ExcelDataloaderSchema diff --git a/DashAI/back/dataloaders/classes/json_dataloader.py b/DashAI/back/dataloaders/classes/json_dataloader.py index 897b43eae..30ca43913 100644 --- a/DashAI/back/dataloaders/classes/json_dataloader.py +++ b/DashAI/back/dataloaders/classes/json_dataloader.py @@ -57,6 +57,7 @@ class JSONDataLoader(BaseDataLoader): are validated before loading to provide early failure feedback. 
""" + SUPPORTED_EXTENSIONS: frozenset[str] = frozenset({".json", ".zip"}) COMPATIBLE_COMPONENTS = [ "TabularClassificationTask", "TextClassificationTask", diff --git a/DashAI/front/src/components/notebooks/datasetCreation/Upload.jsx b/DashAI/front/src/components/notebooks/datasetCreation/Upload.jsx index d506b2663..66a78e045 100644 --- a/DashAI/front/src/components/notebooks/datasetCreation/Upload.jsx +++ b/DashAI/front/src/components/notebooks/datasetCreation/Upload.jsx @@ -227,12 +227,19 @@ function Upload({ // memoize datasetData object so its reference stays stable across renders const datasetDataMemo = useMemo(() => { + let dataloaderName = selectedDataloader; + if (selectedDataloader && typeof selectedDataloader === "object") { + dataloaderName = + selectedDataloader.name || selectedDataloader.display_name || null; + } + const params = { ...formValues, inference_rows: formValues && formValues.inference_rows != null ? formValues.inference_rows : 1000, + ...(dataloaderName ? { dataloader_name: dataloaderName } : {}), }; return { @@ -244,20 +251,13 @@ function Upload({ const acceptAttr = useMemo(() => { if (!selectedDataloader) return undefined; - let s = selectedDataloader; - if (typeof selectedDataloader === "object") { - s = selectedDataloader.name || selectedDataloader.display_name || ""; - } - if (!s || typeof s !== "string") return undefined; - s = s.toLowerCase(); - // CSV dataloader: accept .csv and .zip (zipped CSVs) - if (s.includes("csv")) return ".csv,.zip"; - // JSON dataloader: accept .json and .zip - if (s.includes("json")) return ".json,.zip"; - // Images or generic image loaders - if (s.includes("excel")) return ".xls,.xlsx,.zip"; - // Default: no restriction - return undefined; + const extensions = + typeof selectedDataloader === "object" + ? 
selectedDataloader.metadata?.supported_extensions + : undefined; + + if (!extensions || extensions.length === 0) return undefined; + return extensions.join(","); }, [selectedDataloader]); // renders content inside the drag and drop component depending on the state of the dataset From 5142d412f73c60de90e4770e8b98bb530c1de89c Mon Sep 17 00:00:00 2001 From: Creylay Date: Wed, 6 May 2026 18:30:23 -0400 Subject: [PATCH 041/361] feat: implement StepperNavigationFooter for consistent button navigation across multiple components Co-authored-by: Copilot --- .../generative/CreateSessionCenter.jsx | 47 ++------ .../components/models/CreateSessionSteps.jsx | 33 ++---- .../FormConverterSection.jsx | 6 +- .../ParameterStepConverter.jsx | 12 +- .../converterCreation/ScopeStepConverter.jsx | 36 +++--- .../ConfigureAndUploadDatasetStep.jsx | 35 ++---- .../datasetCreation/SelectDataloaderStep.jsx | 31 ++--- .../explorerCreation/FormExplorerSection.jsx | 6 +- .../ParameterStepExplorer.jsx | 12 +- .../explorerCreation/ScopeStepExplorer.jsx | 19 +--- .../notebookCreation/UploadNotebookSteps.jsx | 33 ++---- .../notebooks/tool/ConfigureToolModal.jsx | 4 +- .../src/components/shared/FormSchema.jsx | 12 +- .../shared/FormSchemaButtonGroup.jsx | 47 ++++---- .../shared/FormSchemaParameterContainer.jsx | 6 +- .../shared/FormSchemaWithSelectedModel.jsx | 11 +- .../shared/StepperNavigationFooter.jsx | 107 ++++++++++++++++++ 17 files changed, 255 insertions(+), 202 deletions(-) create mode 100644 DashAI/front/src/components/shared/StepperNavigationFooter.jsx diff --git a/DashAI/front/src/components/generative/CreateSessionCenter.jsx b/DashAI/front/src/components/generative/CreateSessionCenter.jsx index f1419cad9..e45f20bba 100644 --- a/DashAI/front/src/components/generative/CreateSessionCenter.jsx +++ b/DashAI/front/src/components/generative/CreateSessionCenter.jsx @@ -1,6 +1,5 @@ import { Box, - Button, CircularProgress, Stack, TextField, @@ -10,6 +9,7 @@ import { useTranslation } from 
"react-i18next"; import ComponentSelector from "../custom/ComponentSelector"; import GenerativeBreadcrumbs from "./GenerativeBreadcrumbs"; import { useCreateSession } from "./CreateSessionContext"; +import StepperNavigationFooter from "../shared/StepperNavigationFooter"; export default function CreateSessionCenter() { const { t } = useTranslation(["generative", "common"]); @@ -112,40 +112,17 @@ export default function CreateSessionCenter() { )} - - - {step === 0 ? ( - - ) : ( - - )} - + ); } diff --git a/DashAI/front/src/components/models/CreateSessionSteps.jsx b/DashAI/front/src/components/models/CreateSessionSteps.jsx index c96e1d079..5fcfae908 100644 --- a/DashAI/front/src/components/models/CreateSessionSteps.jsx +++ b/DashAI/front/src/components/models/CreateSessionSteps.jsx @@ -1,6 +1,6 @@ import { useState, useMemo, useEffect, useRef } from "react"; import PropTypes from "prop-types"; -import { Box, Button, Typography } from "@mui/material"; +import { Box, Typography } from "@mui/material"; import { useSnackbar } from "notistack"; import { useFormik } from "formik"; import { useTourContext } from "../tour/TourProvider"; @@ -12,6 +12,7 @@ import { getComponents } from "../../api/component"; import { generateSequentialName } from "../../utils/nameGenerator"; import { useTranslation } from "react-i18next"; import { useModels } from "./ModelsContext"; +import StepperNavigationFooter from "../shared/StepperNavigationFooter"; function CreateSessionSteps({ backHome, @@ -255,30 +256,12 @@ function CreateSessionSteps({ )} - - - - + ); } diff --git a/DashAI/front/src/components/notebooks/converterCreation/FormConverterSection.jsx b/DashAI/front/src/components/notebooks/converterCreation/FormConverterSection.jsx index e6bd89a2c..dc9e36ae0 100644 --- a/DashAI/front/src/components/notebooks/converterCreation/FormConverterSection.jsx +++ b/DashAI/front/src/components/notebooks/converterCreation/FormConverterSection.jsx @@ -15,6 +15,7 @@ export default function 
FormConverterSection({ handleClose, tool, notebook, + hideButtons = false, }) { const [targetColumn, setTargetColumn] = useState(null); const [rows, setRows] = useState([]); @@ -119,8 +120,9 @@ export default function FormConverterSection({ overflow: "visible", display: "flex", flexDirection: "column", - flexGrow: 1, + flex: 1, maxHeight: "100%", + minHeight: 0, }} > {step === 0 && ( @@ -139,6 +141,7 @@ export default function FormConverterSection({ ? () => setStep((s) => s + 1) : () => handleSaveConverter({}) } + hideButtons={hideButtons} /> )} @@ -148,6 +151,7 @@ export default function FormConverterSection({ initialParams={{}} handleSaveConverter={handleSaveConverter} setStep={setStep} + hideButtons={hideButtons} /> )} diff --git a/DashAI/front/src/components/notebooks/converterCreation/ParameterStepConverter.jsx b/DashAI/front/src/components/notebooks/converterCreation/ParameterStepConverter.jsx index 34db7a56c..9a36fe19c 100644 --- a/DashAI/front/src/components/notebooks/converterCreation/ParameterStepConverter.jsx +++ b/DashAI/front/src/components/notebooks/converterCreation/ParameterStepConverter.jsx @@ -10,6 +10,7 @@ export default function ParameterStepConverter({ initialParams, handleSaveConverter, setStep, + hideButtons = false, }) { const tourContext = useTourContext(); const { t } = useTranslation(["common", "datasets"]); @@ -55,7 +56,15 @@ export default function ParameterStepConverter({ }, [tourContext?.stepIndex, tourContext?.run]); return ( - + setStep(0)} saveButtonText={t("datasets:button.createConverter")} + hideButtons={hideButtons} /> diff --git a/DashAI/front/src/components/notebooks/converterCreation/ScopeStepConverter.jsx b/DashAI/front/src/components/notebooks/converterCreation/ScopeStepConverter.jsx index 3f83b6841..57886fbc4 100644 --- a/DashAI/front/src/components/notebooks/converterCreation/ScopeStepConverter.jsx +++ b/DashAI/front/src/components/notebooks/converterCreation/ScopeStepConverter.jsx @@ -24,6 +24,7 @@ export default 
function ScopeStepConverter({ setColumns, notebook, nextStep, + hideButtons = false, }) { const theme = useTheme(); const [datasetInfo, setDatasetInfo] = useState(0); @@ -85,16 +86,17 @@ export default function ScopeStepConverter({ sx={{ display: "flex", flexDirection: "column", - flexGrow: 1, + flex: 1, height: "100%", - gap: 1, + minHeight: 0, }} data-tour="column-selector-converter-container" > {/* Content */} {supervised && ( @@ -181,20 +182,19 @@ export default function ScopeStepConverter({ notebook={notebook} /> )} - - 0 - ? t("common:next") - : t("common:save") - } - data-tour="converter-scope-next-button" - /> + + {/* Buttons */} + 0 + ? t("common:next") + : t("common:save") + } + data-tour="converter-scope-next-button" + /> ); } diff --git a/DashAI/front/src/components/notebooks/datasetCreation/ConfigureAndUploadDatasetStep.jsx b/DashAI/front/src/components/notebooks/datasetCreation/ConfigureAndUploadDatasetStep.jsx index 72e3bee39..319e2b968 100644 --- a/DashAI/front/src/components/notebooks/datasetCreation/ConfigureAndUploadDatasetStep.jsx +++ b/DashAI/front/src/components/notebooks/datasetCreation/ConfigureAndUploadDatasetStep.jsx @@ -1,5 +1,5 @@ import { useState, useEffect, useCallback, useMemo, useRef } from "react"; -import { Box, Button, Grid, TextField } from "@mui/material"; +import { Box, Grid, TextField } from "@mui/material"; import Upload from "./Upload"; import { useSnackbar } from "notistack"; import { enqueueDatasetJob as enqueueDatasetRequest } from "../../../api/job"; @@ -9,6 +9,7 @@ import { generateSequentialName } from "../../../utils/nameGenerator"; import { createDataset } from "../../../api/datasets"; import { useTranslation } from "react-i18next"; import { useTheme } from "@mui/material/styles"; +import StepperNavigationFooter from "../../shared/StepperNavigationFooter"; export default function ConfigureAndUploadDatasetStep({ selectedDataloader, @@ -233,31 +234,13 @@ export default function ConfigureAndUploadDatasetStep({ /> - - - 
- + ); } diff --git a/DashAI/front/src/components/notebooks/datasetCreation/SelectDataloaderStep.jsx b/DashAI/front/src/components/notebooks/datasetCreation/SelectDataloaderStep.jsx index 1a5271182..80cd7716e 100644 --- a/DashAI/front/src/components/notebooks/datasetCreation/SelectDataloaderStep.jsx +++ b/DashAI/front/src/components/notebooks/datasetCreation/SelectDataloaderStep.jsx @@ -1,8 +1,9 @@ import { useEffect } from "react"; import ComponentSelector from "../../custom/ComponentSelector"; -import { Box, Button, CircularProgress, Stack } from "@mui/material"; +import { Box, CircularProgress, Stack } from "@mui/material"; import { useTourContext } from "../../tour/TourProvider"; import { useTranslation } from "react-i18next"; +import StepperNavigationFooter from "../../shared/StepperNavigationFooter"; /** * This component renders a selector for available dataloaders @@ -91,29 +92,11 @@ export default function SelectDataloaderStep({ )} - - - - + ); } diff --git a/DashAI/front/src/components/notebooks/explorerCreation/FormExplorerSection.jsx b/DashAI/front/src/components/notebooks/explorerCreation/FormExplorerSection.jsx index ea841b734..c86c5733d 100644 --- a/DashAI/front/src/components/notebooks/explorerCreation/FormExplorerSection.jsx +++ b/DashAI/front/src/components/notebooks/explorerCreation/FormExplorerSection.jsx @@ -15,6 +15,7 @@ export default function FormExplorerSection({ handleClose, tool, notebook, + hideButtons = false, }) { const [classColumnInitialValue, setClassColumnInitialValue] = useState(null); const [scopeColumns, setScopeColumns] = useState([]); @@ -104,8 +105,9 @@ export default function FormExplorerSection({ overflow: "visible", display: "flex", flexDirection: "column", - flexGrow: 1, + flex: 1, maxHeight: "100%", + minHeight: 0, }} > {step === 0 && ( @@ -120,6 +122,7 @@ export default function FormExplorerSection({ ? 
() => setStep((s) => s + 1) : () => handleSaveExplorer({}) } + hideButtons={hideButtons} /> )} @@ -129,6 +132,7 @@ export default function FormExplorerSection({ initialParams={{}} handleSaveExplorer={handleSaveExplorer} setStep={setStep} + hideButtons={hideButtons} /> )} diff --git a/DashAI/front/src/components/notebooks/explorerCreation/ParameterStepExplorer.jsx b/DashAI/front/src/components/notebooks/explorerCreation/ParameterStepExplorer.jsx index d0739604a..11a8b54cd 100644 --- a/DashAI/front/src/components/notebooks/explorerCreation/ParameterStepExplorer.jsx +++ b/DashAI/front/src/components/notebooks/explorerCreation/ParameterStepExplorer.jsx @@ -10,6 +10,7 @@ export default function ParameterStepExplorer({ initialParams, handleSaveExplorer, setStep, + hideButtons = false, }) { const tourContext = useTourContext(); const { t } = useTranslation(["datasets"]); @@ -45,7 +46,15 @@ export default function ParameterStepExplorer({ }, []); return ( - + setStep(0)} saveButtonText={t("datasets:button.createExplorer")} + hideButtons={hideButtons} /> diff --git a/DashAI/front/src/components/notebooks/explorerCreation/ScopeStepExplorer.jsx b/DashAI/front/src/components/notebooks/explorerCreation/ScopeStepExplorer.jsx index 5e7c549af..7024b108e 100644 --- a/DashAI/front/src/components/notebooks/explorerCreation/ScopeStepExplorer.jsx +++ b/DashAI/front/src/components/notebooks/explorerCreation/ScopeStepExplorer.jsx @@ -11,6 +11,7 @@ export default function ScopeStepExplorer({ tool, setScopeColumns, nextStep, + hideButtons = false, }) { const theme = useTheme(); const [isSelectionValid, setIsSelectionValid] = useState(false); @@ -29,14 +30,14 @@ export default function ScopeStepExplorer({ sx={{ display: "flex", flexDirection: "column", - flexGrow: 1, + flex: 1, height: "100%", - gap: 1, + minHeight: 0, }} data-tour="column-selector-explorer-container" > {/* Content */} - + {/* Buttons */} - + {!hideButtons && ( - + )} ); } diff --git 
a/DashAI/front/src/components/notebooks/notebookCreation/UploadNotebookSteps.jsx b/DashAI/front/src/components/notebooks/notebookCreation/UploadNotebookSteps.jsx index 90e6fac0b..5ff191c8e 100644 --- a/DashAI/front/src/components/notebooks/notebookCreation/UploadNotebookSteps.jsx +++ b/DashAI/front/src/components/notebooks/notebookCreation/UploadNotebookSteps.jsx @@ -1,5 +1,5 @@ import { useState, useMemo, useEffect, useRef } from "react"; -import { Typography, TextField, Box, Button } from "@mui/material"; +import { Typography, TextField, Box } from "@mui/material"; import { useFormik } from "formik"; import DatasetAutocomplete from "./DatasetAutocomplete"; import { createNotebook } from "../../../api/notebook"; @@ -7,6 +7,7 @@ import { useSnackbar } from "notistack"; import NoteBox from "../NoteBox"; import { useTourContext } from "../../tour/TourProvider"; import { useTranslation } from "react-i18next"; +import StepperNavigationFooter from "../../shared/StepperNavigationFooter"; export default function UploadNotebookSteps({ backHome, @@ -153,30 +154,12 @@ export default function UploadNotebookSteps({ /> - - - - + ); } diff --git a/DashAI/front/src/components/notebooks/tool/ConfigureToolModal.jsx b/DashAI/front/src/components/notebooks/tool/ConfigureToolModal.jsx index 9464d5e82..5b1900615 100644 --- a/DashAI/front/src/components/notebooks/tool/ConfigureToolModal.jsx +++ b/DashAI/front/src/components/notebooks/tool/ConfigureToolModal.jsx @@ -21,6 +21,7 @@ import { getDatasetTypesByFilePath, } from "../../../api/datasets"; import { useTranslation } from "react-i18next"; +import StepperNavigationFooter from "../../shared/StepperNavigationFooter"; export default function ConfigureToolModal({ tool, @@ -148,11 +149,12 @@ export default function ConfigureToolModal({ flex: 1, display: "flex", flexDirection: "column", - overflow: "auto", + overflow: "hidden", p: 2, borderRight: "1px solid", borderColor: theme.palette.ui.borderDark, minWidth: 0, + minHeight: 0, }} > + )} 
- + ); } diff --git a/DashAI/front/src/components/shared/FormSchemaButtonGroup.jsx b/DashAI/front/src/components/shared/FormSchemaButtonGroup.jsx index 9046a6cdd..97526f46f 100644 --- a/DashAI/front/src/components/shared/FormSchemaButtonGroup.jsx +++ b/DashAI/front/src/components/shared/FormSchemaButtonGroup.jsx @@ -1,6 +1,6 @@ -import { Button, Box } from "@mui/material"; import PropTypes from "prop-types"; import { useTranslation } from "react-i18next"; +import StepperNavigationFooter from "./StepperNavigationFooter"; function FormSchemaButtonGroup({ onCancel, @@ -28,34 +28,29 @@ function FormSchemaButtonGroup({ ? "create-converter-button" : undefined); + const isFormValid = + !autoSave && Object.keys(formik?.errors ?? {}).length === 0 && !error; + + if (autoSave) { + return null; + } + return ( - - {onCancel && ( - - )} - {!autoSave && ( - - )} - + /> ); } diff --git a/DashAI/front/src/components/shared/FormSchemaParameterContainer.jsx b/DashAI/front/src/components/shared/FormSchemaParameterContainer.jsx index d04f1181b..7f08862b1 100644 --- a/DashAI/front/src/components/shared/FormSchemaParameterContainer.jsx +++ b/DashAI/front/src/components/shared/FormSchemaParameterContainer.jsx @@ -16,7 +16,8 @@ function FormSchemaParameterContainer({ children, showBorder = true }) { + {Boolean(propertyData?.parent) && ( <> diff --git a/DashAI/front/src/components/shared/StepperNavigationFooter.jsx b/DashAI/front/src/components/shared/StepperNavigationFooter.jsx new file mode 100644 index 000000000..2a7c44a10 --- /dev/null +++ b/DashAI/front/src/components/shared/StepperNavigationFooter.jsx @@ -0,0 +1,107 @@ +import { Box, Button, CircularProgress } from "@mui/material"; +import PropTypes from "prop-types"; +import { useTranslation } from "react-i18next"; + +/** + * Reusable navigation footer component for steppers, wizards, and multi-step flows. + * Provides consistent styling with sticky positioning and always-visible buttons. 
+ * + * @param {function} onBack - Callback when back button is clicked + * @param {function} onNext - Callback when next/save button is clicked + * @param {boolean} backDisabled - Disable back button + * @param {boolean} nextDisabled - Disable next button + * @param {string} backLabel - Custom back button label (default: "back") + * @param {string} nextLabel - Custom next button label (default: "next") + * @param {boolean} showBack - Show back button (default: true) + * @param {boolean} showNext - Show next button (default: true) + * @param {string} variant - Next button variant: "next" or "save" (default: "next") + * @param {boolean} loading - Show loading state in next button + * @param {object} sx - Additional MUI sx styles for the footer + */ +export default function StepperNavigationFooter({ + onBack, + onNext, + backDisabled = false, + nextDisabled = false, + backLabel, + nextLabel, + showBack = true, + showNext = true, + variant = "next", + loading = false, + sx = {}, +}) { + const { t } = useTranslation(["common"]); + + const finalBackLabel = backLabel ?? t("common:back"); + const finalNextLabel = + nextLabel ?? (variant === "save" ? 
t("common:save") : t("common:next")); + + return ( + + {showBack && onBack && ( + + )} + {showNext && onNext && ( + + )} + + ); +} + +StepperNavigationFooter.propTypes = { + onBack: PropTypes.func, + onNext: PropTypes.func, + backDisabled: PropTypes.bool, + nextDisabled: PropTypes.bool, + backLabel: PropTypes.string, + nextLabel: PropTypes.string, + showBack: PropTypes.bool, + showNext: PropTypes.bool, + variant: PropTypes.oneOf(["next", "save"]), + loading: PropTypes.bool, + sx: PropTypes.object, +}; From d3225463be6bb9c3cc31f46d65c89249e714e31a Mon Sep 17 00:00:00 2001 From: Creylay Date: Wed, 6 May 2026 18:55:29 -0400 Subject: [PATCH 042/361] feat: update button styles to outlined for consistency in AddModelDialog --- DashAI/front/src/components/models/AddModelDialog.jsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DashAI/front/src/components/models/AddModelDialog.jsx b/DashAI/front/src/components/models/AddModelDialog.jsx index 965bbd261..6971f3f7c 100644 --- a/DashAI/front/src/components/models/AddModelDialog.jsx +++ b/DashAI/front/src/components/models/AddModelDialog.jsx @@ -381,11 +381,11 @@ function AddModelDialog({ - {activeStep > 0 && ( - )} From 7bd1a410b23b498d68e8ec169b0530f65378b239 Mon Sep 17 00:00:00 2001 From: Creylay Date: Wed, 6 May 2026 18:55:54 -0400 Subject: [PATCH 043/361] feat: standardize button styles to outlined and improve layout consistency across explainer modals and prediction panels --- .../explainers/InlineExplainerCreator.jsx | 13 ++-- .../explainers/NewGlobalExplainerModal.jsx | 39 ++++++----- .../explainers/NewLocalExplainerModal.jsx | 39 ++++++----- .../models/DatasetPredictionPanel.jsx | 9 ++- .../models/ManualPredictionPanel.jsx | 1 + .../src/components/models/RunResults.jsx | 64 +++++++++---------- .../predictions/ManualInputForm.jsx | 28 ++++---- 7 files changed, 100 insertions(+), 93 deletions(-) diff --git a/DashAI/front/src/components/explainers/InlineExplainerCreator.jsx 
b/DashAI/front/src/components/explainers/InlineExplainerCreator.jsx index d24543e8d..d85b93389 100644 --- a/DashAI/front/src/components/explainers/InlineExplainerCreator.jsx +++ b/DashAI/front/src/components/explainers/InlineExplainerCreator.jsx @@ -3,7 +3,6 @@ import PropTypes from "prop-types"; import { Box, Button, - ButtonGroup, Collapse, Paper, Step, @@ -356,8 +355,14 @@ export default function InlineExplainerCreator({ )} - - - + ); diff --git a/DashAI/front/src/components/explainers/NewGlobalExplainerModal.jsx b/DashAI/front/src/components/explainers/NewGlobalExplainerModal.jsx index 48dbd0211..ca8a16640 100644 --- a/DashAI/front/src/components/explainers/NewGlobalExplainerModal.jsx +++ b/DashAI/front/src/components/explainers/NewGlobalExplainerModal.jsx @@ -8,7 +8,6 @@ import { DialogActions, DialogContent, DialogTitle, - ButtonGroup, Stepper, Step, StepButton, @@ -347,27 +346,25 @@ export default function NewGlobalExplainerModal({ {/* Actions - Back and Next */} - - - + {activeStep === 0 ? t("common:close") : t("common:back")} + + + - - {activeStep === 1 ? t("common:save") : t("common:next")} - - - + {activeStep === 1 ? t("common:save") : t("common:next")} + + ); diff --git a/DashAI/front/src/components/explainers/NewLocalExplainerModal.jsx b/DashAI/front/src/components/explainers/NewLocalExplainerModal.jsx index a85e559fe..3926ffd2f 100644 --- a/DashAI/front/src/components/explainers/NewLocalExplainerModal.jsx +++ b/DashAI/front/src/components/explainers/NewLocalExplainerModal.jsx @@ -8,7 +8,6 @@ import { DialogActions, DialogContent, DialogTitle, - ButtonGroup, Stepper, Step, StepButton, @@ -360,27 +359,25 @@ export default function NewLocalExplainerModal({ {/* Actions - Back and Next */} - - - + {activeStep === 0 ? t("common:close") : t("common:back")} + + + - - {activeStep === 1 ? t("common:save") : t("common:next")} - - - + {activeStep === 1 ? 
t("common:save") : t("common:next")} + + ); diff --git a/DashAI/front/src/components/models/DatasetPredictionPanel.jsx b/DashAI/front/src/components/models/DatasetPredictionPanel.jsx index 912803624..2204ac817 100644 --- a/DashAI/front/src/components/models/DatasetPredictionPanel.jsx +++ b/DashAI/front/src/components/models/DatasetPredictionPanel.jsx @@ -173,7 +173,14 @@ export default function DatasetPredictionPanel({ setSelectedDataset={setSelectedDataset} /> - + - - + + + + )} - - - - - + + + + )} + + + + - - - - ); } From 8ba20c2b484c1484f1eba2712819583fdf8e0d06 Mon Sep 17 00:00:00 2001 From: Creylay Date: Wed, 6 May 2026 19:15:45 -0400 Subject: [PATCH 044/361] feat: standardize button widths in SessionVisualization for consistent layout --- .../src/components/models/SessionVisualization.jsx | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/DashAI/front/src/components/models/SessionVisualization.jsx b/DashAI/front/src/components/models/SessionVisualization.jsx index 35ecf1783..393c4fe83 100644 --- a/DashAI/front/src/components/models/SessionVisualization.jsx +++ b/DashAI/front/src/components/models/SessionVisualization.jsx @@ -274,17 +274,19 @@ export default function SessionVisualization() { size="small" > {hasTrainMetrics && ( - + {t("common:train")} )} {hasValidationMetrics && ( - + {t("common:validation")} )} {hasTestMetrics && ( - {t("common:test")} + + {t("common:test")} + )} )} @@ -295,6 +297,7 @@ export default function SessionVisualization() { variant={showTable ? "contained" : "outlined"} onClick={() => handleToggleView(true)} startIcon={} + sx={{ width: 110 }} > {t("common:table")} @@ -303,6 +306,7 @@ export default function SessionVisualization() { variant={!showTable ? 
"contained" : "outlined"} onClick={() => handleToggleView(false)} startIcon={} + sx={{ width: 110 }} > {t("common:graphs")} From 43fd8185623f1dd877f837ef36daa429a7f716a6 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Thu, 7 May 2026 10:09:31 -0400 Subject: [PATCH 045/361] fix: update preview tests adding dataloader name --- tests/back/api/test_components_api.py | 38 +++++++++++++-------------- tests/back/types/inference_test.py | 15 +++++++++-- tests/back/types/load_preview_test.py | 14 +++++++--- 3 files changed, 43 insertions(+), 24 deletions(-) diff --git a/tests/back/api/test_components_api.py b/tests/back/api/test_components_api.py index 9d501dbf0..1fe5994e6 100644 --- a/tests/back/api/test_components_api.py +++ b/tests/back/api/test_components_api.py @@ -191,7 +191,7 @@ def test_get_component_by_id(client: TestClient): "type": "DataLoader", "configurable_object": True, "schema": {}, - "metadata": {"category": "File Uploading"}, + "metadata": {"category": "File Uploading", "supported_extensions": []}, "description": None, "display_name": None, "color": None, @@ -318,7 +318,7 @@ def test_get_components_select_only_dataloaders(client: TestClient): "type": "DataLoader", "configurable_object": True, "schema": {}, - "metadata": {"category": "File Uploading"}, + "metadata": {"category": "File Uploading", "supported_extensions": []}, "description": None, "display_name": None, "color": None, @@ -328,7 +328,7 @@ def test_get_components_select_only_dataloaders(client: TestClient): "type": "DataLoader", "configurable_object": True, "schema": {}, - "metadata": {"category": "File Uploading"}, + "metadata": {"category": "File Uploading", "supported_extensions": []}, "description": None, "display_name": None, "color": None, @@ -338,7 +338,7 @@ def test_get_components_select_only_dataloaders(client: TestClient): "type": "DataLoader", "configurable_object": True, "schema": {}, - "metadata": {"category": "File Uploading"}, + "metadata": {"category": "File Uploading", 
"supported_extensions": []}, "description": None, "display_name": None, "color": None, @@ -443,7 +443,7 @@ def test_get_components_ignore_models(client: TestClient): "type": "DataLoader", "configurable_object": True, "schema": {}, - "metadata": {"category": "File Uploading"}, + "metadata": {"category": "File Uploading", "supported_extensions": []}, "description": None, "display_name": None, "color": None, @@ -453,7 +453,7 @@ def test_get_components_ignore_models(client: TestClient): "type": "DataLoader", "configurable_object": True, "schema": {}, - "metadata": {"category": "File Uploading"}, + "metadata": {"category": "File Uploading", "supported_extensions": []}, "description": None, "display_name": None, "color": None, @@ -463,7 +463,7 @@ def test_get_components_ignore_models(client: TestClient): "type": "DataLoader", "configurable_object": True, "schema": {}, - "metadata": {"category": "File Uploading"}, + "metadata": {"category": "File Uploading", "supported_extensions": []}, "description": None, "display_name": None, "color": None, @@ -481,7 +481,7 @@ def test_get_components_ignore_tasks_and_models(client: TestClient): "type": "DataLoader", "configurable_object": True, "schema": {}, - "metadata": {"category": "File Uploading"}, + "metadata": {"category": "File Uploading", "supported_extensions": []}, "description": None, "display_name": None, "color": None, @@ -491,7 +491,7 @@ def test_get_components_ignore_tasks_and_models(client: TestClient): "type": "DataLoader", "configurable_object": True, "schema": {}, - "metadata": {"category": "File Uploading"}, + "metadata": {"category": "File Uploading", "supported_extensions": []}, "description": None, "display_name": None, "color": None, @@ -501,7 +501,7 @@ def test_get_components_ignore_tasks_and_models(client: TestClient): "type": "DataLoader", "configurable_object": True, "schema": {}, - "metadata": {"category": "File Uploading"}, + "metadata": {"category": "File Uploading", "supported_extensions": []}, 
"description": None, "display_name": None, "color": None, @@ -605,7 +605,7 @@ def test_get_components_dataloader_component_parent(client: TestClient): "type": "DataLoader", "configurable_object": True, "schema": {}, - "metadata": {"category": "File Uploading"}, + "metadata": {"category": "File Uploading", "supported_extensions": []}, "description": None, "display_name": None, "color": None, @@ -615,7 +615,7 @@ def test_get_components_dataloader_component_parent(client: TestClient): "type": "DataLoader", "configurable_object": True, "schema": {}, - "metadata": {"category": "File Uploading"}, + "metadata": {"category": "File Uploading", "supported_extensions": []}, "description": None, "display_name": None, "color": None, @@ -650,7 +650,7 @@ def test_get_components_by_type_and_task(client: TestClient): "type": "DataLoader", "configurable_object": True, "schema": {}, - "metadata": {"category": "File Uploading"}, + "metadata": {"category": "File Uploading", "supported_extensions": []}, "description": None, "display_name": None, "color": None, @@ -660,7 +660,7 @@ def test_get_components_by_type_and_task(client: TestClient): "type": "DataLoader", "configurable_object": True, "schema": {}, - "metadata": {"category": "File Uploading"}, + "metadata": {"category": "File Uploading", "supported_extensions": []}, "description": None, "display_name": None, "color": None, @@ -694,7 +694,7 @@ def test_get_components_select_and_ignore_by_type(client: TestClient): "type": "DataLoader", "configurable_object": True, "schema": {}, - "metadata": {"category": "File Uploading"}, + "metadata": {"category": "File Uploading", "supported_extensions": []}, "description": None, "display_name": None, "color": None, @@ -704,7 +704,7 @@ def test_get_components_select_and_ignore_by_type(client: TestClient): "type": "DataLoader", "configurable_object": True, "schema": {}, - "metadata": {"category": "File Uploading"}, + "metadata": {"category": "File Uploading", "supported_extensions": []}, 
"description": None, "display_name": None, "color": None, @@ -714,7 +714,7 @@ def test_get_components_select_and_ignore_by_type(client: TestClient): "type": "DataLoader", "configurable_object": True, "schema": {}, - "metadata": {"category": "File Uploading"}, + "metadata": {"category": "File Uploading", "supported_extensions": []}, "description": None, "display_name": None, "color": None, @@ -735,7 +735,7 @@ def test_get_components_select_type_and_parent(client: TestClient): "type": "DataLoader", "configurable_object": True, "schema": {}, - "metadata": {"category": "File Uploading"}, + "metadata": {"category": "File Uploading", "supported_extensions": []}, "description": None, "display_name": None, "color": None, @@ -745,7 +745,7 @@ def test_get_components_select_type_and_parent(client: TestClient): "type": "DataLoader", "configurable_object": True, "schema": {}, - "metadata": {"category": "File Uploading"}, + "metadata": {"category": "File Uploading", "supported_extensions": []}, "description": None, "display_name": None, "color": None, diff --git a/tests/back/types/inference_test.py b/tests/back/types/inference_test.py index 4eb84b63c..c519ddad1 100644 --- a/tests/back/types/inference_test.py +++ b/tests/back/types/inference_test.py @@ -42,13 +42,24 @@ def test_inference_consistency(client: TestClient): pytest.skip(f"File {p.name} not found in {DATA_DIR}, skipping test.") inferred_csv = _infer( - client, csv_path, "text/csv", {"separator": ",", "methods": ["DashAIPtype"]} + client, + csv_path, + "text/csv", + { + "separator": ",", + "methods": ["DashAIPtype"], + "dataloader_name": "CSVDataLoader", + }, ) inferred_json = _infer( client, json_path, "application/json", - {"data_key": "data", "methods": ["DashAIPtype"]}, + { + "data_key": "data", + "methods": ["DashAIPtype"], + "dataloader_name": "JSONDataLoader", + }, ) # TEST API PROBLEM DOESNT LET USE XLSX SAME AS ABOVE. DOESN'T HAPPEN IN REAL USE. 
df_xslx = pd.read_excel(xslx_path) diff --git a/tests/back/types/load_preview_test.py b/tests/back/types/load_preview_test.py index 27f53052c..a5eae194a 100644 --- a/tests/back/types/load_preview_test.py +++ b/tests/back/types/load_preview_test.py @@ -65,7 +65,9 @@ def test_load_preview_csv(client, file, sep, expected_max_rows, expected_columns with path.open("rb") as f: files = {"file": (file, f, "text/csv")} - data = {"params": json.dumps({"separator": sep})} + data = { + "params": json.dumps({"separator": sep, "dataloader_name": "CSVDataLoader"}) + } resp = client.post("/api/v1/dataset/preview_with_types", data=data, files=files) assert resp.status_code == 200, resp.text @@ -101,7 +103,11 @@ def test_load_preview_json(client, file, datakey, expected_columns): with path.open("rb") as f: files = {"file": (file, f, "application/json")} - data = {"params": json.dumps({"data_key": datakey})} + data = { + "params": json.dumps( + {"data_key": datakey, "dataloader_name": "JSONDataLoader"} + ) + } resp = client.post("/api/v1/dataset/preview_with_types", data=data, files=files) assert resp.status_code == 200, resp.text @@ -124,7 +130,9 @@ def test_schema_change(client: TestClient): with path.open("rb") as f: files = {"file": ("iris.csv", f, "text/csv")} - data = {"params": json.dumps({"separator": ","})} + data = { + "params": json.dumps({"separator": ",", "dataloader_name": "CSVDataLoader"}) + } resp = client.post("/api/v1/dataset/preview_with_types", data=data, files=files) assert resp.status_code == 200, resp.text From 07c94c04f0420c111ac71df56ad19a3a1c7f9ea7 Mon Sep 17 00:00:00 2001 From: Creylay Date: Thu, 7 May 2026 11:19:17 -0400 Subject: [PATCH 046/361] feat: replace re-upload button with icon button for improved UI consistency in PreviewDataset --- .../datasetCreation/PreviewDataset.jsx | 31 +++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/DashAI/front/src/components/notebooks/datasetCreation/PreviewDataset.jsx 
b/DashAI/front/src/components/notebooks/datasetCreation/PreviewDataset.jsx index 70ade95cc..7d356500e 100644 --- a/DashAI/front/src/components/notebooks/datasetCreation/PreviewDataset.jsx +++ b/DashAI/front/src/components/notebooks/datasetCreation/PreviewDataset.jsx @@ -1,6 +1,15 @@ import { useCallback, useEffect, useState, useRef } from "react"; import PropTypes from "prop-types"; -import { Box, Button, CircularProgress, Grid, Typography } from "@mui/material"; +import { + Box, + Button, + CircularProgress, + Grid, + IconButton, + Tooltip, + Typography, +} from "@mui/material"; +import { UploadFile as UploadFileIcon } from "@mui/icons-material"; import { useTheme } from "@mui/material/styles"; import { useSnackbar } from "notistack"; import { previewWithTypes } from "../../../api/datasets"; @@ -251,21 +260,11 @@ function PreviewDataset({ {t("datasets:label.changeColumnTypesInfo")} - + + + + + From 40122c4d177e6c9b149fc44a1383894a450b514d Mon Sep 17 00:00:00 2001 From: Irozuku Date: Wed, 6 May 2026 14:32:58 -0400 Subject: [PATCH 047/361] feat: add ARFFDataLoader for Weka ARFF files Implements ARFFDataLoader using scipy.io.arff, with automatic decoding of nominal byte-string attributes to UTF-8. Supports single file and ZIP split folder uploads. Includes ARFFTestDatasetGenerator and full test suite covering file load, split ZIP load, and bad format detection. 
--- .../dataloaders/classes/arff_dataloader.py | 176 ++++++++++++++++++ DashAI/back/initial_components.py | 4 +- .../back/dataloaders/test_arff_dataloader.py | 130 +++++++++++++ tests/back/test_datasets_generator.py | 56 ++++++ 4 files changed, 365 insertions(+), 1 deletion(-) create mode 100644 DashAI/back/dataloaders/classes/arff_dataloader.py create mode 100644 tests/back/dataloaders/test_arff_dataloader.py diff --git a/DashAI/back/dataloaders/classes/arff_dataloader.py b/DashAI/back/dataloaders/classes/arff_dataloader.py new file mode 100644 index 000000000..a44f9d14e --- /dev/null +++ b/DashAI/back/dataloaders/classes/arff_dataloader.py @@ -0,0 +1,176 @@ +"""DashAI ARFF Dataloader.""" + +import glob +import shutil +from typing import TYPE_CHECKING, Any, Dict + +from DashAI.back.core.schema_fields.base_schema import BaseSchema +from DashAI.back.core.utils import MultilingualString +from DashAI.back.dataloaders.classes.dataloader import BaseDataLoader + +if TYPE_CHECKING: + from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset + + +class ARFFDataloaderSchema(BaseSchema): + """Schema for ARFFDataLoader hyperparameters. + + ARFF files are self-describing; no parameters are required. + """ + + +class ARFFDataLoader(BaseDataLoader): + """Data loader that ingests tabular data from ARFF files into DashAI datasets. + + Reads Weka ARFF files using scipy, decodes nominal attributes from bytes + to UTF-8 strings, and converts the result into DashAI datasets. Handles + multi-file uploads via ZIP archives containing train/test/val split folders. + """ + + SUPPORTED_EXTENSIONS: frozenset[str] = frozenset({".arff"}) + COMPATIBLE_COMPONENTS = ["TabularClassificationTask"] + SCHEMA = ARFFDataloaderSchema + + DESCRIPTION: str = MultilingualString( + en=( + "Data loader for tabular data in ARFF files " + "(Weka Attribute-Relation File Format). " + "ARFF files are self-describing and require no additional parameters." 
+ ), + es=( + "Cargador de datos para datos tabulares en archivos ARFF " + "(formato Weka Attribute-Relation File Format). " + "Los archivos ARFF son autodescriptivos y no requieren " + "parámetros adicionales." + ), + ) + DISPLAY_NAME: str = MultilingualString( + en="ARFF Data Loader", + es="Cargador de Datos ARFF", + ) + + def _read_arff_file(self, filepath: str): + """Read an ARFF file and return a pandas DataFrame. + + Parameters + ---------- + filepath : str + Path to the ARFF file. + + Returns + ------- + pd.DataFrame + DataFrame with nominal columns decoded from bytes to UTF-8. + + Raises + ------ + datasets.builder.DatasetGenerationError + If the file cannot be parsed as valid ARFF. + """ + import pandas as pd + from datasets.builder import DatasetGenerationError + from scipy.io import arff + + try: + data, _ = arff.loadarff(filepath) + except Exception as e: + raise DatasetGenerationError from e + + arff_df = pd.DataFrame(data) + for col in arff_df.columns: + if arff_df[col].dtype == object: + arff_df[col] = arff_df[col].str.decode("utf-8") + return arff_df + + def load_data( + self, + filepath_or_buffer: str, + temp_path: str, + params: Dict[str, Any], + n_sample: int | None = None, + ) -> "DashAIDataset": + """Load uploaded ARFF files into a DatasetDict. + + Parameters + ---------- + filepath_or_buffer : str + Path or URL to an ARFF file or a ZIP archive with split folders. + temp_path : str + Temporary directory for file extraction. + params : Dict[str, Any] + Dataloader parameters (unused; ARFF is self-describing). + n_sample : int | None + Maximum rows to load, or None for all. + + Returns + ------- + DashAIDataset + Dataset with loaded data. 
+ """ + import pandas as pd + from datasets import Dataset, DatasetDict + + from DashAI.back.dataloaders.classes.dashai_dataset import to_dashai_dataset + + prepared_path = self.prepare_files(filepath_or_buffer, temp_path) + + if prepared_path[1] == "file": + arff_df = self._read_arff_file(prepared_path[0]) + if n_sample is not None: + arff_df = arff_df.head(n_sample) + dataset_dict = DatasetDict( + {"train": Dataset.from_pandas(arff_df, preserve_index=False)} + ) + else: + train_files = glob.glob(prepared_path[0] + "/train/*") + test_files = glob.glob(prepared_path[0] + "/test/*") + val_files = glob.glob(prepared_path[0] + "/val/*") + glob.glob( + prepared_path[0] + "/validation/*" + ) + try: + train_df = pd.concat( + [self._read_arff_file(f) for f in sorted(train_files)] + ) + test_df = pd.concat( + [self._read_arff_file(f) for f in sorted(test_files)] + ) + val_df = pd.concat([self._read_arff_file(f) for f in sorted(val_files)]) + if n_sample is not None: + train_df = train_df.head(n_sample) + test_df = test_df.head(n_sample) + val_df = val_df.head(n_sample) + dataset_dict = DatasetDict( + { + "train": Dataset.from_pandas(train_df, preserve_index=False), + "test": Dataset.from_pandas(test_df, preserve_index=False), + "validation": Dataset.from_pandas(val_df, preserve_index=False), + } + ) + finally: + shutil.rmtree(prepared_path[0]) + return to_dashai_dataset(dataset_dict) + + def load_preview( + self, + filepath_or_buffer: str, + params: Dict[str, Any], + n_rows: int = 100, + ): + """Load a preview of the ARFF dataset. + + Parameters + ---------- + filepath_or_buffer : str + Path to the ARFF file. + params : Dict[str, Any] + Unused parameters. + n_rows : int, optional + Maximum rows to return. Default is 100. + + Returns + ------- + pd.DataFrame + Preview DataFrame. 
+ """ + arff_df = self._read_arff_file(filepath_or_buffer) + return arff_df.head(n_rows) diff --git a/DashAI/back/initial_components.py b/DashAI/back/initial_components.py index 3ab31b82f..269730dac 100644 --- a/DashAI/back/initial_components.py +++ b/DashAI/back/initial_components.py @@ -62,6 +62,7 @@ from DashAI.back.converters.simple_converters.nan_remover import NanRemover # DataLoaders +from DashAI.back.dataloaders.classes.arff_dataloader import ARFFDataLoader from DashAI.back.dataloaders.classes.csv_dataloader import CSVDataLoader from DashAI.back.dataloaders.classes.excel_dataloader import ExcelDataLoader from DashAI.back.dataloaders.classes.json_dataloader import JSONDataLoader @@ -367,9 +368,10 @@ def get_initial_components(): XlmRobertaTransformer, XlnetTransformer, # Dataloaders + ARFFDataLoader, CSVDataLoader, - JSONDataLoader, ExcelDataLoader, + JSONDataLoader, # Metrics F1, Accuracy, diff --git a/tests/back/dataloaders/test_arff_dataloader.py b/tests/back/dataloaders/test_arff_dataloader.py new file mode 100644 index 000000000..152445cd0 --- /dev/null +++ b/tests/back/dataloaders/test_arff_dataloader.py @@ -0,0 +1,130 @@ +"""ARFF DataLoader tests module.""" + +import pathlib +from typing import Any, Dict + +import pytest +from sklearn.datasets import load_diabetes, load_iris, load_wine + +from DashAI.back.dataloaders.classes.arff_dataloader import ARFFDataLoader +from tests.back.dataloaders.base_tabular_dataloader_tests import ( + BaseTabularDataLoaderTester, +) +from tests.back.test_datasets_generator import ARFFTestDatasetGenerator + + +@pytest.fixture(scope="module", autouse=True) +def _setup(test_datasets_path: pathlib.Path, random_state: int) -> None: + """Generate the ARFF test datasets.""" + df_iris = load_iris(return_X_y=False, as_frame=True)["frame"] # type: ignore + df_wine = load_wine(return_X_y=False, as_frame=True)["frame"] # type: ignore + df_diabetes = load_diabetes(return_X_y=False, as_frame=True)["frame"] # type: ignore + + for df, 
name in [(df_iris, "iris"), (df_wine, "wine"), (df_diabetes, "diabetes")]: + ARFFTestDatasetGenerator( + df=df, + dataset_name=name, + ouptut_path=test_datasets_path, + random_state=random_state, + ) + + +class TestARFFDataloader(BaseTabularDataLoaderTester): + @property + def dataloader_cls(self): + return ARFFDataLoader + + @property + def data_type_name(self): + return "arff" + + @pytest.mark.parametrize( + ("dataset_path", "params", "nrows", "ncols"), + [ + ("iris/basic.arff", {}, 150, 5), + ("wine/basic.arff", {}, 178, 14), + ("diabetes/basic.arff", {}, 442, 11), + ], + ids=[ + "test_load_arff_iris", + "test_load_arff_wine", + "test_load_arff_diabetes", + ], + ) + def test_load_data_from_file( + self, + test_datasets_path: pathlib.Path, + dataset_path: str, + params: Dict[str, Any], + nrows: int, + ncols: int, + ) -> None: + super()._test_load_data_from_file( + dataset_path=test_datasets_path / self.data_type_name / dataset_path, + params=params, + nrows=nrows, + ncols=ncols, + ) + + @pytest.mark.parametrize( + ( + "dataset_path", + "params", + "train_nrows", + "test_nrows", + "val_nrows", + "ncols", + ), + [ + ("iris/split.zip", {}, 50, 50, 50, 5), + ("wine/split.zip", {}, 60, 60, 60, 14), + ("diabetes/split.zip", {}, 148, 148, 148, 11), + ], + ids=[ + "test_load_arff_iris_from_split_zip", + "test_load_arff_wine_from_split_zip", + "test_load_arff_diabetes_from_split_zip", + ], + ) + def test_load_data_from_zip( + self, + test_datasets_path: pathlib.Path, + dataset_path: str, + params: Dict[str, Any], + train_nrows: int, + test_nrows: int, + val_nrows: int, + ncols: int, + ): + super()._test_load_data_from_zip( + dataset_path=test_datasets_path / self.data_type_name / dataset_path, + params=params, + train_nrows=train_nrows, + test_nrows=test_nrows, + val_nrows=val_nrows, + ncols=ncols, + ) + + @pytest.mark.parametrize( + ("dataset_path", "params"), + [ + ("iris/bad_format.arff", {}), + ("wine/bad_format.arff", {}), + ("diabetes/bad_format.arff", {}), + ], + 
ids=[ + "test_load_arff_iris_with_bad_format", + "test_load_arff_wine_with_bad_format", + "test_load_arff_diabetes_with_bad_format", + ], + ) + def test_dataloader_try_to_load_a_invalid_datasets( + self, + test_datasets_path: pathlib.Path, + dataset_path: str, + params: Dict[str, Any], + ): + super()._test_dataloader_try_to_load_a_invalid_datasets( + dataset_path=test_datasets_path / self.data_type_name / dataset_path, + params=params, + ) diff --git a/tests/back/test_datasets_generator.py b/tests/back/test_datasets_generator.py index 98036af15..2d52e43e7 100644 --- a/tests/back/test_datasets_generator.py +++ b/tests/back/test_datasets_generator.py @@ -334,6 +334,62 @@ def _gernerate_splits( ) +class ARFFTestDatasetGenerator: + def __init__( + self, + df: pd.DataFrame, + dataset_name: str, + ouptut_path: pathlib.Path, + random_state: int, + ) -> None: + base_path = pathlib.Path(ouptut_path) / "arff" / dataset_name + os.makedirs(base_path, exist_ok=True) + + self._generate_common_cases(base_path=base_path, df=df) + self._generate_bad_formats(base_path=base_path) + self._generate_splits(base_path=base_path, df=df, random_state=random_state) + + @staticmethod + def _write_arff( + df: pd.DataFrame, filepath: pathlib.Path, relation: str = "dataset" + ): + with open(filepath, "w") as f: + f.write(f"@relation {relation}\n\n") + for col in df.columns: + safe_col = f"'{col}'" + if pd.api.types.is_numeric_dtype(df[col]): + f.write(f"@attribute {safe_col} NUMERIC\n") + else: + values = ",".join(f"'{v}'" for v in df[col].unique()) + f.write(f"@attribute {safe_col} {{{values}}}\n") + f.write("\n@data\n") + for _, row in df.iterrows(): + f.write(",".join(str(v) for v in row) + "\n") + + def _generate_common_cases(self, base_path: pathlib.Path, df: pd.DataFrame): + self._write_arff(df, base_path / "basic.arff") + + def _generate_bad_formats(self, base_path: pathlib.Path): + with open(base_path / "bad_format.arff", "wb") as f: + f.write(b"not a valid arff file #$%&--") + with 
open(base_path / "empty_file.arff", "w") as f: + f.write("") + + def _generate_splits( + self, base_path: pathlib.Path, df: pd.DataFrame, random_state: int + ): + os.makedirs(base_path / "split" / "train", exist_ok=True) + os.makedirs(base_path / "split" / "test", exist_ok=True) + os.makedirs(base_path / "split" / "val", exist_ok=True) + + train, test, val = _get_test_splits(df, random_state) + + self._write_arff(train, base_path / "split" / "train" / "train.arff") + self._write_arff(test, base_path / "split" / "test" / "test.arff") + self._write_arff(val, base_path / "split" / "val" / "val.arff") + shutil.make_archive(str(base_path / "split"), "zip", base_path / "split") + + class ExcelTestDatasetGenerator: def __init__( self, From e62337a8c5cc56d348bef25f1a7ec426cce2fd73 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Thu, 7 May 2026 11:14:37 -0400 Subject: [PATCH 048/361] fix: add .zip to ARFFDataLoader SUPPORTED_EXTENSIONS --- DashAI/back/dataloaders/classes/arff_dataloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DashAI/back/dataloaders/classes/arff_dataloader.py b/DashAI/back/dataloaders/classes/arff_dataloader.py index a44f9d14e..30b07e79a 100644 --- a/DashAI/back/dataloaders/classes/arff_dataloader.py +++ b/DashAI/back/dataloaders/classes/arff_dataloader.py @@ -27,7 +27,7 @@ class ARFFDataLoader(BaseDataLoader): multi-file uploads via ZIP archives containing train/test/val split folders. 
""" - SUPPORTED_EXTENSIONS: frozenset[str] = frozenset({".arff"}) + SUPPORTED_EXTENSIONS: frozenset[str] = frozenset({".arff", ".zip"}) COMPATIBLE_COMPONENTS = ["TabularClassificationTask"] SCHEMA = ARFFDataloaderSchema From f281505c736ffb2b86a274c6833318910b6fcdcb Mon Sep 17 00:00:00 2001 From: Creylay Date: Thu, 7 May 2026 13:14:33 -0400 Subject: [PATCH 049/361] feat: enhance button styles and improve file input handling for consistency in dataset upload --- .../datasetCreation/PreviewDataset.jsx | 20 ++++++++++++++++-- .../notebooks/datasetCreation/Upload.jsx | 21 +++++++++---------- 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/DashAI/front/src/components/notebooks/datasetCreation/PreviewDataset.jsx b/DashAI/front/src/components/notebooks/datasetCreation/PreviewDataset.jsx index 7d356500e..147d30e0c 100644 --- a/DashAI/front/src/components/notebooks/datasetCreation/PreviewDataset.jsx +++ b/DashAI/front/src/components/notebooks/datasetCreation/PreviewDataset.jsx @@ -261,8 +261,24 @@ function PreviewDataset({ - - + + diff --git a/DashAI/front/src/components/notebooks/datasetCreation/Upload.jsx b/DashAI/front/src/components/notebooks/datasetCreation/Upload.jsx index d506b2663..5572f0533 100644 --- a/DashAI/front/src/components/notebooks/datasetCreation/Upload.jsx +++ b/DashAI/front/src/components/notebooks/datasetCreation/Upload.jsx @@ -154,7 +154,7 @@ function Upload({ }; const handleSelect = async (e) => { - if (datasetState !== EMPTY) return; + if (datasetState !== EMPTY && datasetState !== LOADED) return; const f = e.target.files && e.target.files[0]; if (!f) return; @@ -267,15 +267,6 @@ function Upload({ case EMPTY: return ( - - - {dragActive ? 
( { e.stopPropagation(); - handleDeleteDataset(); + inputRef.current?.click(); }} onPreviewError={onPreviewError} onTypesChanged={onTypesChanged} @@ -378,6 +369,14 @@ function Upload({ + + {/* Drag and drop */} Date: Thu, 7 May 2026 13:42:20 -0400 Subject: [PATCH 050/361] feat: add highlighting for newly added run cards in session visualization --- .../src/components/models/ModelsContext.jsx | 4 ++ .../models/SessionVisualization.jsx | 41 ++++++++++++++++++- DashAI/front/src/hooks/models/useSessions.js | 6 +++ 3 files changed, 50 insertions(+), 1 deletion(-) diff --git a/DashAI/front/src/components/models/ModelsContext.jsx b/DashAI/front/src/components/models/ModelsContext.jsx index 198c160a8..fc8da6802 100644 --- a/DashAI/front/src/components/models/ModelsContext.jsx +++ b/DashAI/front/src/components/models/ModelsContext.jsx @@ -68,6 +68,8 @@ export function ModelsProvider({ children }) { onEditRun, handleCancelRetrain, handleConfirmRetrain, + lastAddedRunId, + clearLastAddedRunId, } = useSessions({ t }); const [selectedModel, setSelectedModel] = useState(null); @@ -149,6 +151,8 @@ export function ModelsProvider({ children }) { onDeleteRun, handleCancelRetrain, handleConfirmRetrain, + lastAddedRunId, + clearLastAddedRunId, datasetInfo, setDatasetInfo, datasetTab, diff --git a/DashAI/front/src/components/models/SessionVisualization.jsx b/DashAI/front/src/components/models/SessionVisualization.jsx index 35ecf1783..6800ffb78 100644 --- a/DashAI/front/src/components/models/SessionVisualization.jsx +++ b/DashAI/front/src/components/models/SessionVisualization.jsx @@ -13,6 +13,7 @@ import { IconButton, Tooltip, } from "@mui/material"; +import { useTheme, alpha } from "@mui/material/styles"; import { PlayArrow, TableChart, @@ -33,6 +34,7 @@ import { useTourContext } from "../tour/TourProvider"; export default function SessionVisualization() { const [models, setModels] = useState([]); const [selectedRunId, setSelectedRunId] = useState(null); + const [highlightedRunId, 
setHighlightedRunId] = useState(null); const [tableHeight, setTableHeight] = useState(280); const [showTable, setShowTable] = useState(true); const [previousTableHeight, setPreviousTableHeight] = useState(280); @@ -54,8 +56,14 @@ export default function SessionVisualization() { operationsCount, handleCancelRetrain, handleConfirmRetrain, + lastAddedRunId, + clearLastAddedRunId, } = useModels(); + const theme = useTheme(); + const glowStart = alpha(theme.palette.primary.main, 0.65); + const glowMid = alpha(theme.palette.primary.main, 0.2); + // Auto-expand when switching to graphs const handleToggleView = React.useCallback( (isTable) => { @@ -112,6 +120,24 @@ export default function SessionVisualization() { } }, [sessionTourContext]); + // Scroll to and highlight a newly added run card + useEffect(() => { + if (!lastAddedRunId) return; + const scrollTimer = setTimeout(() => { + const element = document.getElementById(`run-card-${lastAddedRunId}`); + if (element) { + element.scrollIntoView({ behavior: "smooth", block: "nearest" }); + } + setHighlightedRunId(lastAddedRunId); + clearLastAddedRunId(); + }, 100); + const clearTimer = setTimeout(() => setHighlightedRunId(null), 4100); + return () => { + clearTimeout(scrollTimer); + clearTimeout(clearTimer); + }; + }, [lastAddedRunId]); + const handleRowClick = React.useCallback((runId) => { setSelectedRunId(runId); const element = document.getElementById(`run-card-${runId}`); @@ -444,7 +470,20 @@ export default function SessionVisualization() { } sx={{ scrollMarginTop: "20px", - transition: "all 0.3s ease", + transition: "transform 0.3s ease", + "@keyframes newRunHighlight": { + "0%": { + boxShadow: `0 0 0 3px ${glowStart}, 0 0 24px 8px ${glowMid}`, + }, + "65%": { + boxShadow: `0 0 0 1px ${alpha(theme.palette.primary.main, 0.15)}`, + }, + "100%": { boxShadow: "none" }, + }, + animation: + highlightedRunId === run.id + ? 
"newRunHighlight 4s ease-out forwards" + : "none", ...(selectedRunId === run.id && { transform: "scale(1.02)", boxShadow: 3, diff --git a/DashAI/front/src/hooks/models/useSessions.js b/DashAI/front/src/hooks/models/useSessions.js index 59cca91fb..c86e6dbec 100644 --- a/DashAI/front/src/hooks/models/useSessions.js +++ b/DashAI/front/src/hooks/models/useSessions.js @@ -30,6 +30,7 @@ export function useSessions({ t }) { const [retrainDialogOpen, setRetrainDialogOpen] = useState(false); const [runToRetrain, setRunToRetrain] = useState(null); const [operationsCount, setOperationsCount] = useState(null); + const [lastAddedRunId, setLastAddedRunId] = useState(null); // -------- actions -------- @@ -207,11 +208,14 @@ export function useSessions({ t }) { const onRunCreated = (newRun) => { setRuns((prev) => [...prev, newRun]); + setLastAddedRunId(newRun.id); enqueueSnackbar(t("models:message.runAdded", { runName: newRun.name }), { variant: "success", }); }; + const clearLastAddedRunId = useCallback(() => setLastAddedRunId(null), []); + const onTrainRun = async (run) => { try { // Only show confirmation dialog for previously trained runs (Retrain flow) @@ -308,5 +312,7 @@ export function useSessions({ t }) { onDeleteRun, handleCancelRetrain, handleConfirmRetrain, + lastAddedRunId, + clearLastAddedRunId, }; } From a85819909cf0f59cafdbf395ba84582a3da31a38 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Thu, 7 May 2026 17:47:16 -0400 Subject: [PATCH 051/361] feat: add tool prop and row-count warning alert to ColumnSelector --- .../components/notebooks/ColumnSelector.jsx | 33 ++++++++++++++++--- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/DashAI/front/src/components/notebooks/ColumnSelector.jsx b/DashAI/front/src/components/notebooks/ColumnSelector.jsx index 6ff78a76a..fd1594fc8 100644 --- a/DashAI/front/src/components/notebooks/ColumnSelector.jsx +++ b/DashAI/front/src/components/notebooks/ColumnSelector.jsx @@ -1,6 +1,6 @@ import { useState, useEffect, 
useCallback, useMemo } from "react"; import PropTypes from "prop-types"; -import { Box, Typography } from "@mui/material"; +import { Alert, Box, Typography } from "@mui/material"; import { MaterialReactTable, useMaterialReactTable, @@ -36,6 +36,7 @@ import { Trans, useTranslation } from "react-i18next"; */ function ColumnSelector({ file_path, + tool, inputCardinality = {}, allowedDtypes = [], allowedTypes = [], @@ -337,8 +338,8 @@ function ColumnSelector({ context: inputCardinality.exact ? "exact" : inputCardinality.max - ? "range" - : "min", + ? "range" + : "min", })} )} @@ -394,7 +395,26 @@ function ColumnSelector({ )} - {" "} + + + {tool?.metadata?.changes_row_count && ( + `${theme.palette.warning.main}40`, + border: (theme) => `1px solid ${theme.palette.warning.main}`, + "& \t.MuiAlert-message": { + display: "flex", + alignItems: "center", + }, + mb: 1.5, + }} + > + {t("datasets:message.changesRowCountWarning")} + + )} + {/* Data Grid */} Date: Thu, 7 May 2026 17:47:35 -0400 Subject: [PATCH 052/361] refactor: replace changes_row_count() method with CHANGES_ROW_COUNT class attribute --- DashAI/back/converters/base_converter.py | 15 ++------------- .../back/converters/imbalanced_learn_wrapper.py | 11 +---------- .../simple_converters/character_replacer.py | 10 ---------- .../converters/simple_converters/nan_remover.py | 11 +---------- DashAI/back/job/converter_job.py | 2 +- 5 files changed, 5 insertions(+), 44 deletions(-) diff --git a/DashAI/back/converters/base_converter.py b/DashAI/back/converters/base_converter.py index 470acce3f..4fe0db33d 100644 --- a/DashAI/back/converters/base_converter.py +++ b/DashAI/back/converters/base_converter.py @@ -41,6 +41,7 @@ class BaseConverter(ConfigObject, ABC): ICON: Final[str] = Icon.Extension.value COLOR: Final[str] = "rgb(255, 255, 255)" SUPERVISED: bool = False + CHANGES_ROW_COUNT: bool = False SCHEMA: BaseConverterSchema @classmethod @@ -70,6 +71,7 @@ def get_metadata(cls) -> Dict[str, Any]: meta["icon"] = cls.ICON if 
cls.ICON else Icon.Extension.value meta["color"] = cls.COLOR if cls.COLOR else "rgb(255, 255, 255)" meta["supervised"] = cls.SUPERVISED + meta["changes_row_count"] = cls.CHANGES_ROW_COUNT # Serialize allowed_types class references → class name strings for the frontend raw_types = meta.get("allowed_types", []) @@ -84,19 +86,6 @@ def get_metadata(cls) -> Dict[str, Any]: return meta - def changes_row_count(self) -> bool: - """Indicate whether this converter changes the number of dataset rows. - - Samplers (e.g. SMOTE, RandomUnderSampler) return True because they - add or remove rows. Most transformers return False. - - Returns - ------- - bool - True if the converter may add or remove rows, False otherwise. - """ - return False - @abstractmethod def get_output_type(self, column_name: str = None) -> DashAIDataType: """Return the DashAI data type produced by this converter for a given column. diff --git a/DashAI/back/converters/imbalanced_learn_wrapper.py b/DashAI/back/converters/imbalanced_learn_wrapper.py index 3c2f03c11..a8f08e7ba 100644 --- a/DashAI/back/converters/imbalanced_learn_wrapper.py +++ b/DashAI/back/converters/imbalanced_learn_wrapper.py @@ -23,6 +23,7 @@ class ImbalancedLearnWrapper(BaseConverter, metaclass=ABCMeta): """ SUPERVISED = True + CHANGES_ROW_COUNT = True def __init__(self, **kwargs): """Initialise the imbalanced-learn wrapper and reset internal state. @@ -38,16 +39,6 @@ def __init__(self, **kwargs): self.original_X_column_names_: list = [] self.original_target_column_name_: str = "" - def changes_row_count(self) -> bool: - """Return ``True`` because all samplers add or remove rows. - - Returns - ------- - bool - Always ``True``. - """ - return True - def get_output_type(self, column_name: str = None) -> DashAIDataType: """Not implemented; type preservation is handled in ``transform``. 
diff --git a/DashAI/back/converters/simple_converters/character_replacer.py b/DashAI/back/converters/simple_converters/character_replacer.py index 531ad6a52..c4e66654f 100644 --- a/DashAI/back/converters/simple_converters/character_replacer.py +++ b/DashAI/back/converters/simple_converters/character_replacer.py @@ -253,16 +253,6 @@ def replace_function(batch): splits=x.splits, ) - def changes_row_count(self) -> bool: - """Return ``False`` because this converter never adds or removes rows. - - Returns - ------- - bool - Always ``False``. - """ - return False - def get_output_type(self, column_name: str = None) -> DashAIDataType: """Return the default output type for a transformed column. diff --git a/DashAI/back/converters/simple_converters/nan_remover.py b/DashAI/back/converters/simple_converters/nan_remover.py index 4f9861e54..c9570ae04 100644 --- a/DashAI/back/converters/simple_converters/nan_remover.py +++ b/DashAI/back/converters/simple_converters/nan_remover.py @@ -41,6 +41,7 @@ class NanRemover(BasicPreprocessingConverter, BaseConverter): """ SCHEMA = NanRemoverSchema + CHANGES_ROW_COUNT = True DESCRIPTION = MultilingualString( en=( "Removes the rows with NaN values from the dataset. Keep in mind that " @@ -178,16 +179,6 @@ def transform( return to_dashai_dataset(cleaned_dataset, types=preserved_types) - def changes_row_count(self) -> bool: - """Return ``True`` because this converter removes rows with null values. - - Returns - ------- - bool - Always ``True``. - """ - return True - def get_output_type(self, column_name: str = None) -> DashAIDataType: """Return the preserved type for a column, or a Text placeholder. 
diff --git a/DashAI/back/job/converter_job.py b/DashAI/back/job/converter_job.py index 33c0a84ea..a35b60121 100644 --- a/DashAI/back/job/converter_job.py +++ b/DashAI/back/job/converter_job.py @@ -382,7 +382,7 @@ def instantiate_converters( f"Error transforming data with {converter_name}: {e}" ) from e - if converter_instance.changes_row_count(): + if type(converter_instance).CHANGES_ROW_COUNT: loaded_dataset = transformed_dataset else: loaded_dataset = _rebuild_dataset_with_transformed_columns( From 88441c7b3ff915d0671bed3c4269381a21b14fe4 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Thu, 7 May 2026 17:47:39 -0400 Subject: [PATCH 053/361] feat: pass tool to ColumnSelector and add changesRowCountWarning i18n key --- .../notebooks/converterCreation/ScopeStepConverter.jsx | 1 + DashAI/front/src/utils/i18n/locales/en/datasets.json | 1 + DashAI/front/src/utils/i18n/locales/es/datasets.json | 1 + 3 files changed, 3 insertions(+) diff --git a/DashAI/front/src/components/notebooks/converterCreation/ScopeStepConverter.jsx b/DashAI/front/src/components/notebooks/converterCreation/ScopeStepConverter.jsx index 83e1f7f35..30432e450 100644 --- a/DashAI/front/src/components/notebooks/converterCreation/ScopeStepConverter.jsx +++ b/DashAI/front/src/components/notebooks/converterCreation/ScopeStepConverter.jsx @@ -110,6 +110,7 @@ export default function ScopeStepConverter({ {/* Scope selection UI */} { diff --git a/DashAI/front/src/utils/i18n/locales/en/datasets.json b/DashAI/front/src/utils/i18n/locales/en/datasets.json index 489c06130..019f426c3 100644 --- a/DashAI/front/src/utils/i18n/locales/en/datasets.json +++ b/DashAI/front/src/utils/i18n/locales/en/datasets.json @@ -368,6 +368,7 @@ }, "message": { "columnTypesUpdated": "Column types updated successfully", + "changesRowCountWarning": "This converter modifies the number of rows in the dataset (adds or removes rows). Note that only the selected columns will be kept. 
Any columns not included in the scope will be removed from the dataset.", "converterCreated": "Converter {{name}} created successfully", "converterProcessed": "Converter {{name}} processed successfully", "datasetCreationStarted": "Dataset creation started", diff --git a/DashAI/front/src/utils/i18n/locales/es/datasets.json b/DashAI/front/src/utils/i18n/locales/es/datasets.json index 27b22f00c..ccb0f4851 100644 --- a/DashAI/front/src/utils/i18n/locales/es/datasets.json +++ b/DashAI/front/src/utils/i18n/locales/es/datasets.json @@ -374,6 +374,7 @@ }, "message": { "columnTypesUpdated": "Tipos de columnas actualizados exitosamente", + "changesRowCountWarning": "Este convertidor modifica la cantidad de filas del dataset (agrega o elimina filas). Ten en cuenta que solo se conservarán las columnas seleccionadas. Las columnas no incluidas en el alcance serán eliminadas del dataset.", "converterCreated": "Convertidor {{name}} creado exitosamente", "converterProcessed": "Convertidor {{name}} procesado exitosamente", "datasetCreationStarted": "Creación de dataset iniciada", From 232993747031f858e59208e7b8a550dc16c6df5d Mon Sep 17 00:00:00 2001 From: Irozuku Date: Thu, 7 May 2026 17:53:56 -0400 Subject: [PATCH 054/361] fix: correct indentation in ColumnSelector component with pre-commit --- DashAI/front/src/components/notebooks/ColumnSelector.jsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DashAI/front/src/components/notebooks/ColumnSelector.jsx b/DashAI/front/src/components/notebooks/ColumnSelector.jsx index fd1594fc8..9c0d1c612 100644 --- a/DashAI/front/src/components/notebooks/ColumnSelector.jsx +++ b/DashAI/front/src/components/notebooks/ColumnSelector.jsx @@ -338,8 +338,8 @@ function ColumnSelector({ context: inputCardinality.exact ? "exact" : inputCardinality.max - ? "range" - : "min", + ? 
"range" + : "min", })} )} From d023bde72e61713650bdee22156fe7cf34b535af Mon Sep 17 00:00:00 2001 From: Creylay Date: Fri, 8 May 2026 10:31:07 -0400 Subject: [PATCH 055/361] feat: implement highlighting for newly added explorers and converters in notebook view --- .../context/ExplorersAndConvertersContext.jsx | 7 +- .../notebooks/converter/ConverterBox.jsx | 17 +++- .../FormConverterSection.jsx | 8 +- .../notebooks/explorer/ExplorerBox.jsx | 17 +++- .../explorerCreation/FormExplorerSection.jsx | 8 +- .../notebooks/notebook/NotebookView.jsx | 85 +++++++++++++++---- 6 files changed, 118 insertions(+), 24 deletions(-) diff --git a/DashAI/front/src/components/notebooks/context/ExplorersAndConvertersContext.jsx b/DashAI/front/src/components/notebooks/context/ExplorersAndConvertersContext.jsx index 5e6a7e7a3..1a1ad06d1 100644 --- a/DashAI/front/src/components/notebooks/context/ExplorersAndConvertersContext.jsx +++ b/DashAI/front/src/components/notebooks/context/ExplorersAndConvertersContext.jsx @@ -1,4 +1,4 @@ -import React, { createContext, useContext, useState } from "react"; +import React, { createContext, useContext, useState, useCallback } from "react"; const ExplorersAndConvertersContext = createContext(); @@ -7,10 +7,15 @@ export const useExplorersAndConverters = () => export const ExplorersAndConvertersProvider = ({ children }) => { const [explorersAndConverters, setExplorersAndConverters] = useState([]); + const [lastAddedItemId, setLastAddedItemId] = useState(null); + const clearLastAddedItemId = useCallback(() => setLastAddedItemId(null), []); const value = { explorersAndConverters, setExplorersAndConverters, + lastAddedItemId, + setLastAddedItemId, + clearLastAddedItemId, }; return ( diff --git a/DashAI/front/src/components/notebooks/converter/ConverterBox.jsx b/DashAI/front/src/components/notebooks/converter/ConverterBox.jsx index ac4c618d6..8b175fad8 100644 --- a/DashAI/front/src/components/notebooks/converter/ConverterBox.jsx +++ 
b/DashAI/front/src/components/notebooks/converter/ConverterBox.jsx @@ -8,7 +8,7 @@ import { CircularProgress, IconButton, } from "@mui/material"; -import { useTheme } from "@mui/material/styles"; +import { useTheme, alpha } from "@mui/material/styles"; import { Delete } from "@mui/icons-material"; import { MaterialReactTable, @@ -73,6 +73,7 @@ export default function ConverterBox({ converter, onStatusChange, handleConverterDeleteClick, + isHighlighted = false, }) { const theme = useTheme(); const [converterComponent, setConverterComponent] = useState({}); @@ -135,6 +136,20 @@ export default function ConverterBox({ bgcolor: theme.palette.background.box, borderRadius: 2, height: "100%", + position: "relative", + zIndex: isHighlighted ? 1 : 0, + "@keyframes converterHighlight": { + "0%": { + boxShadow: `0 0 0 3px ${alpha(theme.palette.primary.main, 0.65)}, 0 0 24px 8px ${alpha(theme.palette.primary.main, 0.2)}`, + }, + "65%": { + boxShadow: `0 0 0 1px ${alpha(theme.palette.primary.main, 0.15)}`, + }, + "100%": { boxShadow: "none" }, + }, + animation: isHighlighted + ? 
"converterHighlight 4s ease-out forwards" + : "none", }} > { const data = { ...response, type: "converter" }; setExplorersAndConverters((prev) => [...prev, data]); + setLastAddedItemId(data.id); enqueueSnackbar( t("datasets:message.converterCreated", { name: tool.name }), { diff --git a/DashAI/front/src/components/notebooks/explorer/ExplorerBox.jsx b/DashAI/front/src/components/notebooks/explorer/ExplorerBox.jsx index 8b5b9ed45..bf6d0b072 100644 --- a/DashAI/front/src/components/notebooks/explorer/ExplorerBox.jsx +++ b/DashAI/front/src/components/notebooks/explorer/ExplorerBox.jsx @@ -9,7 +9,7 @@ import { CircularProgress, Button, } from "@mui/material"; -import { useTheme } from "@mui/material/styles"; +import { useTheme, alpha } from "@mui/material/styles"; import { Analytics, Info, Delete } from "@mui/icons-material"; import { TabResults } from "./tabs"; import { getExplorerStatus } from "../../../utils/explorerStatus"; @@ -23,6 +23,7 @@ export default function ExplorerBox({ explorer, handleExplorerDeleteClick, onStatusChange, + isHighlighted = false, }) { const { t } = useTranslation(["datasets", "common"]); const theme = useTheme(); @@ -88,6 +89,20 @@ export default function ExplorerBox({ bgcolor: theme.palette.background.box, borderRadius: 2, height: "100%", + position: "relative", + zIndex: isHighlighted ? 1 : 0, + "@keyframes explorerHighlight": { + "0%": { + boxShadow: `0 0 0 3px ${alpha(theme.palette.primary.main, 0.65)}, 0 0 24px 8px ${alpha(theme.palette.primary.main, 0.2)}`, + }, + "65%": { + boxShadow: `0 0 0 1px ${alpha(theme.palette.primary.main, 0.15)}`, + }, + "100%": { boxShadow: "none" }, + }, + animation: isHighlighted + ? 
"explorerHighlight 4s ease-out forwards" + : "none", }} className="explorer-box" > diff --git a/DashAI/front/src/components/notebooks/explorerCreation/FormExplorerSection.jsx b/DashAI/front/src/components/notebooks/explorerCreation/FormExplorerSection.jsx index ea841b734..e2c54ef53 100644 --- a/DashAI/front/src/components/notebooks/explorerCreation/FormExplorerSection.jsx +++ b/DashAI/front/src/components/notebooks/explorerCreation/FormExplorerSection.jsx @@ -32,8 +32,11 @@ export default function FormExplorerSection({ }, }); - const { explorersAndConverters, setExplorersAndConverters } = - useExplorersAndConverters(); + const { + explorersAndConverters, + setExplorersAndConverters, + setLastAddedItemId, + } = useExplorersAndConverters(); const { enqueueSnackbar } = useSnackbar(); const { t } = useTranslation(["datasets", "common"]); @@ -54,6 +57,7 @@ export default function FormExplorerSection({ ); const data = { ...created, type: "explorer" }; setExplorersAndConverters((prev) => [...prev, data]); + setLastAddedItemId(data.id); const response = await enqueueExplorerJob(created.id); diff --git a/DashAI/front/src/components/notebooks/notebook/NotebookView.jsx b/DashAI/front/src/components/notebooks/notebook/NotebookView.jsx index d1b951c7b..b7b9240f4 100644 --- a/DashAI/front/src/components/notebooks/notebook/NotebookView.jsx +++ b/DashAI/front/src/components/notebooks/notebook/NotebookView.jsx @@ -21,14 +21,10 @@ const RowItem = React.memo(function RowItem({ handleExplorerDeleteClick, handleConverterDeleteClick, handleStatusChange, + isHighlighted, }) { return ( - + {item.type === "explorer" ? ( handleStatusChange(id, newStatus, "explorer") } + isHighlighted={isHighlighted} /> ) : item.type === "converter" ? 
( handleStatusChange(id, newStatus, "converter") } + isHighlighted={isHighlighted} /> ) : null} @@ -67,8 +65,12 @@ export default function NotebookView({ notebook }) { } }, [tourContext]); - const { explorersAndConverters, setExplorersAndConverters } = - useExplorersAndConverters(); + const { + explorersAndConverters, + setExplorersAndConverters, + lastAddedItemId, + clearLastAddedItemId, + } = useExplorersAndConverters(); const [openDeleteExplorerConfirmation, setOpenDeleteExplorerConfirmation] = useState(false); const [openDeleteConverterConfirmation, setOpenDeleteConverterConfirmation] = @@ -78,7 +80,10 @@ export default function NotebookView({ notebook }) { const [deleteModalContent, setDeleteModalContent] = useState(""); const [itemsToDelete, setItemsToDelete] = useState([]); const [listSize, setListSize] = useState(explorersAndConverters.length); + const [highlightedItemId, setHighlightedItemId] = useState(null); const listBoxRef = useRef(null); + const pendingHighlightRef = useRef(false); + const isProgrammaticScrollRef = useRef(false); const fetchExplorersAndConverters = useCallback(async () => { if (!notebook?.id) return; @@ -219,13 +224,61 @@ export default function NotebookView({ notebook }) { }; useEffect(() => { - scrollToBottom(); + if (!pendingHighlightRef.current) scrollToBottom(); }, [listSize]); useEffect(() => { setListSize(explorersAndConverters.length); }, [explorersAndConverters]); + // Smooth scroll to and highlight newly added item (blocks the instant listSize scroll) + useEffect(() => { + if (!lastAddedItemId) return; + pendingHighlightRef.current = true; + const scrollTimer = setTimeout(() => { + const index = explorersAndConverters.findIndex( + (item) => item.id === lastAddedItemId, + ); + if (listBoxRef.current && index >= 0) { + isProgrammaticScrollRef.current = true; + listBoxRef.current.scrollToIndex({ + index, + align: "end", + behavior: "smooth", + }); + setTimeout(() => { + isProgrammaticScrollRef.current = false; + }, 600); + } + 
setHighlightedItemId(lastAddedItemId); + clearLastAddedItemId(); + pendingHighlightRef.current = false; + }, 100); + const clearTimer = setTimeout(() => setHighlightedItemId(null), 4250); + return () => { + clearTimeout(scrollTimer); + clearTimeout(clearTimer); + }; + }, [lastAddedItemId]); + + const renderItem = useCallback( + (index, item) => ( + + ), + [ + handleExplorerDeleteClick, + handleConverterDeleteClick, + handleStatusChange, + highlightedItemId, + ], + ); + if (!notebook) { return ( 1 ? listSize - 1 : 0} data={explorersAndConverters} - itemContent={(index, item) => ( - - )} + itemContent={renderItem} + onScroll={() => { + if (!isProgrammaticScrollRef.current && highlightedItemId) { + setHighlightedItemId(null); + } + }} /> )} From c8d1d2d4dfa0cd37c8a7efb856df6df1b347aa72 Mon Sep 17 00:00:00 2001 From: Creylay Date: Fri, 8 May 2026 11:07:47 -0400 Subject: [PATCH 056/361] fix: replace Paper with Accordion for model comparison panel and improve resizing logic --- .../models/SessionVisualization.jsx | 273 +++++++++--------- 1 file changed, 133 insertions(+), 140 deletions(-) diff --git a/DashAI/front/src/components/models/SessionVisualization.jsx b/DashAI/front/src/components/models/SessionVisualization.jsx index 35ecf1783..d26066749 100644 --- a/DashAI/front/src/components/models/SessionVisualization.jsx +++ b/DashAI/front/src/components/models/SessionVisualization.jsx @@ -3,14 +3,14 @@ import { Box, Typography, Stack, - Paper, + Accordion, + AccordionSummary, + AccordionDetails, Divider, Button, ButtonGroup, ToggleButtonGroup, ToggleButton, - Collapse, - IconButton, Tooltip, } from "@mui/material"; import { @@ -18,7 +18,6 @@ import { TableChart, BarChart, ExpandMore, - ExpandLess, } from "@mui/icons-material"; import ModelComparisonTable from "./ModelComparisonTable"; import RunCard from "./RunCard"; @@ -157,12 +156,12 @@ export default function SessionVisualization() { const handleMouseMove = React.useCallback((e) => { if (isResizing.current) { - const 
container = document.querySelector("[data-session-viz]"); - if (container) { - const containerRect = container.getBoundingClientRect(); - const newHeight = e.clientY - containerRect.top; + const details = document.querySelector("[data-accordion-details]"); + if (details) { + const detailsRect = details.getBoundingClientRect(); + const newHeight = e.clientY - detailsRect.top; const minHeight = 150; - const maxHeight = containerRect.height * 0.8; + const maxHeight = window.innerHeight * 0.7; const clampedHeight = Math.max( minHeight, Math.min(maxHeight, newHeight), @@ -223,166 +222,162 @@ export default function SessionVisualization() { }} > {/* Sticky Comparison Table */} - setTableCollapsed((v) => !v)} + disableGutters + elevation={1} sx={{ - height: tableCollapsed ? "auto" : `${tableHeight}px`, flexShrink: 0, borderBottom: "1px solid", borderColor: "divider", - p: 2, - position: "relative", - display: "flex", - flexDirection: "column", + borderRadius: "4px", + "&:before": { display: "none" }, }} > - - + - setTableCollapsed((v) => !v)} - > - {tableCollapsed ? 
: } - + + } + sx={{ + "& .MuiAccordionSummary-content": { my: "8px", mr: 1 }, + }} + > + {t("models:label.modelComparison")} - - - {/* Metric Split Selector — controls both table and graph views */} - {(hasTrainMetrics || hasValidationMetrics || hasTestMetrics) && ( - { - if (newValue !== null) setMetricSplit(newValue); - }} - size="small" - > - {hasTrainMetrics && ( - - {t("common:train")} - - )} - {hasValidationMetrics && ( - - {t("common:validation")} - - )} - {hasTestMetrics && ( - {t("common:test")} - )} - - )} + e.stopPropagation()} + > + {/* Metric Split Selector — controls both table and graph views */} + {(hasTrainMetrics || + hasValidationMetrics || + hasTestMetrics) && ( + { + if (newValue !== null) setMetricSplit(newValue); + }} + size="small" + > + {hasTrainMetrics && ( + + {t("common:train")} + + )} + {hasValidationMetrics && ( + + {t("common:validation")} + + )} + {hasTestMetrics && ( + + {t("common:test")} + + )} + + )} - {/* Toggle between Table and Graphs */} - - - - + {/* Toggle between Table and Graphs */} + + + + - {/* Run All Button */} - {runs.length > 0 && - runs.some((r) => r.status === 0) && ( // Not Started + {/* Run All Button */} + {runs.length > 0 && runs.some((r) => r.status === 0) && ( )} + - - + + - - {runs.length === 0 ? ( - - - {t("models:label.noRunsYet")} - - - ) : ( - - {showTable ? ( - - ) : ( - - )} - - )} - - + {runs.length === 0 ? ( + + + {t("models:label.noRunsYet")} + + + ) : ( + + {showTable ? 
( + + ) : ( + + )} + + )} - {/* Resize Handle */} - {!tableCollapsed && ( + {/* Resize Handle */} { isResizing.current = true; @@ -391,21 +386,19 @@ export default function SessionVisualization() { }} sx={{ position: "absolute", - bottom: -2, + bottom: 0, left: 0, right: 0, height: "5px", cursor: "row-resize", bgcolor: "transparent", transition: "background-color 0.2s ease", - "&:hover": { - bgcolor: "primary.main", - }, + "&:hover": { bgcolor: "primary.main" }, zIndex: 10, }} /> - )} - + + From 3103167e9a47d060408e74277f4413d480411b23 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Fri, 8 May 2026 11:13:22 -0400 Subject: [PATCH 057/361] fix: resolve WebSocket base URL construction in production build REACT_APP_API_URL is /api (relative) in production, which is invalid as a new URL() base. Extract origin via new URL(env, window.location.origin).origin so relative env values fall back to page origin while absolute dev URLs (http://localhost:8000/api) correctly yield port 8000. --- DashAI/front/src/components/models/LiveMetricsChart.jsx | 9 ++++++--- DashAI/front/src/hooks/useHardwareMonitor.js | 9 ++++++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/DashAI/front/src/components/models/LiveMetricsChart.jsx b/DashAI/front/src/components/models/LiveMetricsChart.jsx index c779f0a60..014463571 100644 --- a/DashAI/front/src/components/models/LiveMetricsChart.jsx +++ b/DashAI/front/src/components/models/LiveMetricsChart.jsx @@ -86,12 +86,15 @@ export function LiveMetricsChart({ run }) { }; } - const apiUrl = process.env.REACT_APP_API_URL || `${window.location.origin}`; + const wsOrigin = new URL( + process.env.REACT_APP_API_URL || "/", + window.location.origin, + ).origin; let wsUrl; try { - wsUrl = new URL(`/api/v1/metrics/ws/${run.id}`, apiUrl); + wsUrl = new URL(`/api/v1/metrics/ws/${run.id}`, wsOrigin); } catch (e) { - console.error("Invalid WebSocket base URL:", apiUrl, e); + console.error("Invalid WebSocket base URL:", wsOrigin, e); return; } if 
(wsUrl.protocol === "http:") { diff --git a/DashAI/front/src/hooks/useHardwareMonitor.js b/DashAI/front/src/hooks/useHardwareMonitor.js index 1828c3b96..a26eb821f 100644 --- a/DashAI/front/src/hooks/useHardwareMonitor.js +++ b/DashAI/front/src/hooks/useHardwareMonitor.js @@ -43,12 +43,15 @@ export function useHardwareMonitor(enabled) { clearReconnectTimeout(); - const apiUrl = process.env.REACT_APP_API_URL || `${window.location.origin}`; + const wsOrigin = new URL( + process.env.REACT_APP_API_URL || "/", + window.location.origin, + ).origin; let wsUrl; try { - wsUrl = new URL("api/v1/hardware/ws", apiUrl); + wsUrl = new URL("/api/v1/hardware/ws", wsOrigin); } catch (e) { - console.error("Invalid WebSocket base URL:", apiUrl); + console.error("Invalid WebSocket base URL:", wsOrigin); return; } From 0b083df3ffaf2b7b10e1d30584eb5c0daedd8f76 Mon Sep 17 00:00:00 2001 From: Creylay Date: Fri, 8 May 2026 13:52:34 -0400 Subject: [PATCH 058/361] feat: enhance dataset export functionality with filtering and sorting options --- DashAI/back/api/api_v1/endpoints/datasets.py | 371 ++++++------------ DashAI/front/src/api/datasets.ts | 15 +- .../notebooks/dataset/DatasetTable.jsx | 57 +-- 3 files changed, 159 insertions(+), 284 deletions(-) diff --git a/DashAI/back/api/api_v1/endpoints/datasets.py b/DashAI/back/api/api_v1/endpoints/datasets.py index 185c009dc..c8955c8e8 100644 --- a/DashAI/back/api/api_v1/endpoints/datasets.py +++ b/DashAI/back/api/api_v1/endpoints/datasets.py @@ -4,7 +4,7 @@ import zipfile from collections import OrderedDict from datetime import datetime, timezone -from typing import TYPE_CHECKING, Any, Dict +from typing import TYPE_CHECKING, Any, Dict, Optional import pyarrow as pa from fastapi import APIRouter, Depends, File, Form, Query, Response, UploadFile, status @@ -1528,159 +1528,149 @@ async def get_dataset_file( return JSONResponse(content={"rows": rows, "total": total_rows}) +def _build_export_response(table: "pa.Table", dataset_name: str) -> 
StreamingResponse: + """Build a StreamingResponse for dataset export. + + For image datasets (struct columns), produces a ZIP in ImageFolder format: + `` ) : ( - + Date: Fri, 8 May 2026 14:37:47 -0400 Subject: [PATCH 061/361] feat: add HubDownload model, job, and API endpoints - HubDownloadStatus enum (downloading/ready/error) - HubDownload DB model with unique constraint on (source_name, dataset_id) - Alembic migration for hub_download table - HubDownloadJob: fetches file via fetch_full, stores under hub_downloads/{id}/ - /v1/hub-download CRUD + /files listing endpoint - Idempotent POST: returns existing record if ready/downloading, retries on error --- .../a1c3e5f7b9d2_add_hub_download_table.py | 48 ++++ DashAI/back/api/api_v1/api.py | 4 +- .../back/api/api_v1/endpoints/hub_download.py | 213 ++++++++++++++++++ DashAI/back/core/enums/status.py | 6 + DashAI/back/dependencies/database/models.py | 32 +++ DashAI/back/job/hub_download_job.py | 109 +++++++++ 6 files changed, 411 insertions(+), 1 deletion(-) create mode 100644 DashAI/alembic/versions/a1c3e5f7b9d2_add_hub_download_table.py create mode 100644 DashAI/back/api/api_v1/endpoints/hub_download.py create mode 100644 DashAI/back/job/hub_download_job.py diff --git a/DashAI/alembic/versions/a1c3e5f7b9d2_add_hub_download_table.py b/DashAI/alembic/versions/a1c3e5f7b9d2_add_hub_download_table.py new file mode 100644 index 000000000..625b7c301 --- /dev/null +++ b/DashAI/alembic/versions/a1c3e5f7b9d2_add_hub_download_table.py @@ -0,0 +1,48 @@ +"""Add hub_download table + +Revision ID: a1c3e5f7b9d2 +Revises: b4f9e70098e7 +Create Date: 2026-05-08 00:00:00.000000 + +""" + +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. 
+revision: str = "a1c3e5f7b9d2" +down_revision: Union[str, None] = "b4f9e70098e7" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.create_table( + "hub_download", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column("source_name", sa.String(), nullable=False), + sa.Column("dataset_id", sa.String(), nullable=False), + sa.Column("name", sa.String(), nullable=False), + sa.Column("local_path", sa.String(), nullable=True), + sa.Column( + "status", + sa.Enum("downloading", "ready", "error", name="hubdownloadstatus"), + nullable=False, + ), + sa.Column("error_message", sa.String(), nullable=True), + sa.Column("created", sa.DateTime(), nullable=True), + sa.Column("last_modified", sa.DateTime(), nullable=True), + sa.PrimaryKeyConstraint("id", name=op.f("pk_hub_download")), + sa.UniqueConstraint( + "source_name", + "dataset_id", + name="uq_hub_download_source_dataset", + ), + ) + + +def downgrade() -> None: + op.drop_table("hub_download") + op.execute("DROP TYPE IF EXISTS hubdownloadstatus") diff --git a/DashAI/back/api/api_v1/api.py b/DashAI/back/api/api_v1/api.py index 897fe5b7e..0695c38de 100644 --- a/DashAI/back/api/api_v1/api.py +++ b/DashAI/back/api/api_v1/api.py @@ -2,6 +2,7 @@ from DashAI.back.api.api_v1.endpoints.components import router as components from DashAI.back.api.api_v1.endpoints.converters import router as converters +from DashAI.back.api.api_v1.endpoints.dataset_source import router as dataset_source from DashAI.back.api.api_v1.endpoints.datasets import router as datasets from DashAI.back.api.api_v1.endpoints.explainers import router as explainers from DashAI.back.api.api_v1.endpoints.explorers import router as explorers @@ -12,6 +13,7 @@ router as generative_session, ) from DashAI.back.api.api_v1.endpoints.hardware import router as hardware +from DashAI.back.api.api_v1.endpoints.hub_download import router as hub_download from 
DashAI.back.api.api_v1.endpoints.jobs import router as jobs from DashAI.back.api.api_v1.endpoints.metrics import router as metrics from DashAI.back.api.api_v1.endpoints.model_sessions import router as model_sessions @@ -20,7 +22,6 @@ from DashAI.back.api.api_v1.endpoints.plugins import router as plugins from DashAI.back.api.api_v1.endpoints.predict import router as predict from DashAI.back.api.api_v1.endpoints.runs import router as runs -from DashAI.back.api.api_v1.endpoints.dataset_source import router as dataset_source from DashAI.back.api.api_v1.endpoints.scoring import router as scoring api_router_v1 = APIRouter() @@ -42,3 +43,4 @@ api_router_v1.include_router(hardware, prefix="/hardware") api_router_v1.include_router(scoring, prefix="/scoring") api_router_v1.include_router(dataset_source, prefix="/dataset-source") +api_router_v1.include_router(hub_download, prefix="/hub-download") diff --git a/DashAI/back/api/api_v1/endpoints/hub_download.py b/DashAI/back/api/api_v1/endpoints/hub_download.py new file mode 100644 index 000000000..8f9fefb13 --- /dev/null +++ b/DashAI/back/api/api_v1/endpoints/hub_download.py @@ -0,0 +1,213 @@ +"""Hub download management endpoints.""" + +import logging +import os +from pathlib import Path +from typing import TYPE_CHECKING, Any, Dict, List + +from fastapi import APIRouter, Depends, status +from fastapi.exceptions import HTTPException +from kink import di +from pydantic import BaseModel +from sqlalchemy import exc + +from DashAI.back.core.enums.status import HubDownloadStatus +from DashAI.back.dependencies.database.models import HubDownload + +if TYPE_CHECKING: + from sqlalchemy.orm import sessionmaker + + from DashAI.back.dependencies.registry import ComponentRegistry + +log = logging.getLogger(__name__) +router = APIRouter() + + +def _row_to_dict(row: HubDownload) -> Dict[str, Any]: + return { + "id": row.id, + "source_name": row.source_name, + "dataset_id": row.dataset_id, + "name": row.name, + "local_path": row.local_path, + 
"status": row.status.value, + "error_message": row.error_message, + "created": row.created.isoformat() if row.created else None, + "last_modified": row.last_modified.isoformat() if row.last_modified else None, + } + + +class CreateDownloadRequest(BaseModel): + source_name: str + dataset_id: str + name: str + + +@router.get("/", response_model=List[Dict[str, Any]]) +async def list_downloads( + session_factory: "sessionmaker" = Depends(lambda: di["session_factory"]), +) -> List[Dict[str, Any]]: + """Return all hub download records.""" + with session_factory() as db: + rows = db.query(HubDownload).order_by(HubDownload.created.desc()).all() + return [_row_to_dict(r) for r in rows] + + +@router.post("/", status_code=status.HTTP_201_CREATED, response_model=Dict[str, Any]) +async def create_download( + body: CreateDownloadRequest, + session_factory: "sessionmaker" = Depends(lambda: di["session_factory"]), + registry: "ComponentRegistry" = Depends(lambda: di["component_registry"]), + job_queue=Depends(lambda: di["job_queue"]), +) -> Dict[str, Any]: + """Create a HubDownload record and enqueue the download job. + + If a record for (source_name, dataset_id) already exists and its status is + READY, it is returned immediately without re-downloading. 
+ """ + from DashAI.back.job.hub_download_job import HubDownloadJob + + sources = registry._registry.get("DatasetSource", {}) + if body.source_name not in sources: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"DatasetSource '{body.source_name}' not found.", + ) + + with session_factory() as db: + existing = ( + db.query(HubDownload) + .filter( + HubDownload.source_name == body.source_name, + HubDownload.dataset_id == body.dataset_id, + ) + .first() + ) + if existing is not None: + if existing.status == HubDownloadStatus.READY: + return _row_to_dict(existing) + if existing.status == HubDownloadStatus.DOWNLOADING: + return _row_to_dict(existing) + # ERROR — allow retry: reset to downloading + existing.status = HubDownloadStatus.DOWNLOADING + existing.error_message = None + existing.local_path = None + existing.name = body.name + try: + db.commit() + db.refresh(existing) + except exc.SQLAlchemyError as e: + log.exception(e) + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="DB error resetting download.", + ) from e + row = existing + else: + row = HubDownload( + source_name=body.source_name, + dataset_id=body.dataset_id, + name=body.name, + status=HubDownloadStatus.DOWNLOADING, + ) + db.add(row) + try: + db.commit() + db.refresh(row) + except exc.SQLAlchemyError as e: + log.exception(e) + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="DB error creating download record.", + ) from e + + hub_download_id = row.id + result_dict = _row_to_dict(row) + + job = HubDownloadJob( + kwargs={ + "hub_download_id": hub_download_id, + "source_name": body.source_name, + "dataset_source_id": body.dataset_id, + } + ) + job_result = job_queue.put(job) + job_id = getattr(job_result, "id", job_result) + result_dict["job_id"] = job_id + return result_dict + + +@router.get("/{hub_download_id}", response_model=Dict[str, Any]) +async def get_download( + hub_download_id: int, + session_factory: 
"sessionmaker" = Depends(lambda: di["session_factory"]), +) -> Dict[str, Any]: + """Return a single hub download record by id.""" + with session_factory() as db: + row = db.get(HubDownload, hub_download_id) + if row is None: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"HubDownload {hub_download_id} not found.", + ) + return _row_to_dict(row) + + +@router.delete("/{hub_download_id}", status_code=status.HTTP_204_NO_CONTENT) +async def delete_download( + hub_download_id: int, + session_factory: "sessionmaker" = Depends(lambda: di["session_factory"]), +) -> None: + """Delete a hub download record and its cached files.""" + import shutil + + with session_factory() as db: + row = db.get(HubDownload, hub_download_id) + if row is None: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"HubDownload {hub_download_id} not found.", + ) + local_path = row.local_path + try: + db.delete(row) + db.commit() + except exc.SQLAlchemyError as e: + log.exception(e) + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="DB error deleting download record.", + ) from e + + if local_path and os.path.exists(local_path): + shutil.rmtree(local_path, ignore_errors=True) + + +@router.get("/{hub_download_id}/files", response_model=List[str]) +async def list_files( + hub_download_id: int, + session_factory: "sessionmaker" = Depends(lambda: di["session_factory"]), +) -> List[str]: + """Return the list of files in a ready hub download directory.""" + with session_factory() as db: + row = db.get(HubDownload, hub_download_id) + if row is None: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"HubDownload {hub_download_id} not found.", + ) + if row.status != HubDownloadStatus.READY or not row.local_path: + raise HTTPException( + status_code=status.HTTP_409_CONFLICT, + detail="Download is not ready yet.", + ) + local_path = row.local_path + + path = Path(local_path) + if not path.exists(): + 
raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail="Download directory not found on disk.", + ) + files = sorted(str(p.relative_to(path)) for p in path.rglob("*") if p.is_file()) + return files diff --git a/DashAI/back/core/enums/status.py b/DashAI/back/core/enums/status.py index b66e70fd2..5fa493058 100644 --- a/DashAI/back/core/enums/status.py +++ b/DashAI/back/core/enums/status.py @@ -53,3 +53,9 @@ class PredictionStatus(Enum): STARTED = 2 FINISHED = 3 ERROR = 4 + + +class HubDownloadStatus(Enum): + DOWNLOADING = "downloading" + READY = "ready" + ERROR = "error" diff --git a/DashAI/back/dependencies/database/models.py b/DashAI/back/dependencies/database/models.py index d301c716f..476b56e81 100644 --- a/DashAI/back/dependencies/database/models.py +++ b/DashAI/back/dependencies/database/models.py @@ -13,6 +13,7 @@ Integer, MetaData, String, + UniqueConstraint, ) from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import Mapped, mapped_column, relationship @@ -24,6 +25,7 @@ DatasetStatus, ExplainerStatus, ExplorerStatus, + HubDownloadStatus, PluginStatus, PredictionStatus, RunStatus, @@ -716,3 +718,33 @@ def delete_result(self) -> None: self.delivery_time = None self.start_time = None self.end_time = None + + +class HubDownload(Base): + __tablename__ = "hub_download" + + id: Mapped[int] = mapped_column(primary_key=True) + source_name: Mapped[str] = mapped_column(String, nullable=False) + dataset_id: Mapped[str] = mapped_column(String, nullable=False) + name: Mapped[str] = mapped_column(String, nullable=False) + local_path: Mapped[str] = mapped_column(String, nullable=True) + status: Mapped[Enum] = mapped_column( + Enum(HubDownloadStatus), + nullable=False, + default=HubDownloadStatus.DOWNLOADING, + ) + error_message: Mapped[str] = mapped_column(String, nullable=True) + created: Mapped[DateTime] = mapped_column(DateTime, default=datetime.now) + last_modified: Mapped[DateTime] = mapped_column( + DateTime, + 
default=datetime.now, + onupdate=datetime.now, + ) + + __table_args__ = ( + UniqueConstraint( + "source_name", + "dataset_id", + name="uq_hub_download_source_dataset", + ), + ) diff --git a/DashAI/back/job/hub_download_job.py b/DashAI/back/job/hub_download_job.py new file mode 100644 index 000000000..c87929e0d --- /dev/null +++ b/DashAI/back/job/hub_download_job.py @@ -0,0 +1,109 @@ +"""Job for downloading a dataset from an external hub source.""" + +import logging +from pathlib import Path +from typing import TYPE_CHECKING + +from kink import di, inject +from sqlalchemy import exc + +from DashAI.back.core.enums.status import HubDownloadStatus +from DashAI.back.dependencies.database.models import HubDownload +from DashAI.back.job.base_job import BaseJob, JobError + +if TYPE_CHECKING: + from sqlalchemy.orm import sessionmaker + +log = logging.getLogger(__name__) + + +class HubDownloadJob(BaseJob): + """Job that fetches a dataset file from an external hub source. + + Parameters + ---------- + kwargs : dict + - hub_download_id: int — DB row id + - source_name: str — DatasetSource class name + - dataset_source_id: str — source-specific dataset identifier + """ + + @inject + def set_status_as_delivered( + self, session_factory: "sessionmaker" = lambda di: di["session_factory"] + ) -> None: + """No-op: hub downloads don't use the delivered state.""" + + @inject + def set_status_as_error( + self, session_factory: "sessionmaker" = lambda di: di["session_factory"] + ) -> None: + hub_download_id: int = self.kwargs["hub_download_id"] + error_msg: str = self.kwargs.get("_error_message", "") + with session_factory() as db: + row: HubDownload = db.get(HubDownload, hub_download_id) + if row is not None: + row.status = HubDownloadStatus.ERROR + row.error_message = error_msg + try: + db.commit() + except exc.SQLAlchemyError as e: + log.exception(e) + + def get_job_name(self) -> str: + return f"Hub download: {self.kwargs.get('dataset_source_id', '')}" + + @inject + def run(self) -> 
None: + import shutil + + component_registry = di["component_registry"] + session_factory = di["session_factory"] + config = di["config"] + + hub_download_id: int = self.kwargs["hub_download_id"] + source_name: str = self.kwargs["source_name"] + dataset_source_id: str = self.kwargs["dataset_source_id"] + + hub_downloads_base: Path = config["LOCAL_PATH"] / "hub_downloads" + download_dir: Path = hub_downloads_base / str(hub_download_id) + + try: + sources = component_registry._registry.get("DatasetSource", {}) + if source_name not in sources: + raise JobError(f"DatasetSource '{source_name}' not found in registry.") + + download_dir.mkdir(parents=True, exist_ok=True) + source = sources[source_name]["class"]() + file_path, _ = source.fetch_full(dataset_source_id, str(download_dir)) + log.debug("Hub dataset '%s' downloaded to %s", dataset_source_id, file_path) + + with session_factory() as db: + row: HubDownload = db.get(HubDownload, hub_download_id) + if row is None: + raise JobError(f"HubDownload row {hub_download_id} not found.") + row.status = HubDownloadStatus.READY + row.local_path = str(download_dir) + try: + db.commit() + except exc.SQLAlchemyError as e: + log.exception(e) + raise JobError("DB error saving download path.") from e + + log.debug("Hub download job %d completed.", hub_download_id) + + except JobError as e: + log.error("Hub download job %d failed: %s", hub_download_id, e) + self.kwargs["_error_message"] = str(e) + with session_factory() as db: + row = db.get(HubDownload, hub_download_id) + if row is not None: + row.status = HubDownloadStatus.ERROR + row.error_message = str(e) + try: + db.commit() + except exc.SQLAlchemyError as db_err: + log.exception(db_err) + if download_dir.exists(): + shutil.rmtree(download_dir, ignore_errors=True) + raise From cbc7e47f688d4ba7c7eb33061494e6309d5393af Mon Sep 17 00:00:00 2001 From: Irozuku Date: Fri, 8 May 2026 14:37:54 -0400 Subject: [PATCH 062/361] feat: use cached hub download in DatasetJob when 
hub_download_id provided Skips re-downloading when hub_download_id + selected_file are passed in params; temp_dir remains None so the cached directory is not cleaned up. --- DashAI/back/job/dataset_job.py | 56 ++++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 9 deletions(-) diff --git a/DashAI/back/job/dataset_job.py b/DashAI/back/job/dataset_job.py index 8b6855155..410889867 100644 --- a/DashAI/back/job/dataset_job.py +++ b/DashAI/back/job/dataset_job.py @@ -168,19 +168,54 @@ def run( # --- Hub import path --- import tempfile - hub_temp = tempfile.mkdtemp() - temp_dir = hub_temp # ensures finally block cleans it up + from DashAI.back.core.enums.status import HubDownloadStatus + from DashAI.back.dependencies.database.models import ( + HubDownload, + ) dataset_source_id = self.kwargs.get("dataset_source_id", "") + hub_download_id = params.get("hub_download_id") + selected_file = params.get("selected_file") + sources = component_registry._registry.get("DatasetSource", {}) if source_name not in sources: raise JobError( f"DatasetSource '{source_name}' not found in registry." ) source = sources[source_name]["class"]() - file_path_hub, source_dataloader_name = source.fetch_full( - dataset_source_id, hub_temp - ) + + if hub_download_id is not None: + # Use pre-downloaded cached file — do not clean up temp_dir + with session_factory() as db: + hub_row = db.get(HubDownload, hub_download_id) + if ( + hub_row is None + or hub_row.status != HubDownloadStatus.READY + ): + raise JobError( + f"HubDownload {hub_download_id} is not ready." 
+ ) + hub_work_dir = hub_row.local_path + if selected_file: + file_path_hub = str(Path(hub_work_dir) / selected_file) + else: + files = sorted( + str(p) + for p in Path(hub_work_dir).rglob("*") + if p.is_file() + ) + if not files: + raise JobError("Hub download directory is empty.") + file_path_hub = files[0] + source_dataloader_name = params.get("dataloader", "") + else: + hub_temp = tempfile.mkdtemp() + temp_dir = hub_temp # ensures finally block cleans it up + hub_work_dir = hub_temp + file_path_hub, source_dataloader_name = source.fetch_full( + dataset_source_id, hub_temp + ) + selected_dataloader = ( params.get("dataloader") or source_dataloader_name ) @@ -191,12 +226,15 @@ def run( and selected_dataloader not in compatible ): raise JobError( - "Selected DataLoader is not compatible with this source." + "Selected DataLoader is not compatible" + " with this source." ) - dl_registry = component_registry._registry.get("DataLoader", {}) + _reg = component_registry._registry + dl_registry = _reg.get("DataLoader", {}) if selected_dataloader not in dl_registry: raise JobError( - f"DataLoader '{selected_dataloader}' not found in registry." + f"DataLoader '{selected_dataloader}'" + " not found in registry." 
) dataloader = dl_registry[selected_dataloader]["class"]() log.debug( @@ -207,7 +245,7 @@ def run( hub_loader_params = params.get("dataloader_params", {}) new_dataset = dataloader.load_data( filepath_or_buffer=file_path_hub, - temp_path=hub_temp, + temp_path=hub_work_dir, params=hub_loader_params, n_sample=None, ) From d6964365d5d1d18f9a7898935ac1c30166cabcb7 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Fri, 8 May 2026 14:38:12 -0400 Subject: [PATCH 063/361] feat: hub download-first workflow, downloads sidebar, file selector step MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - hub.ts: HubDownload types + listHubDownloads/getHubDownload/createHubDownload/deleteHubDownload/listHubDownloadFiles - DatasetDetail: button state machine (download → downloading → add to DashAI / error+retry) - HubLeftBar: downloaded datasets section with status, delete, and add-to-dashai actions - HubContent: manages downloads map, polls in-progress every 3s, wires all handlers - HubImportPanel: optional file-selector step 0 when hubDownload prop is provided - i18n: new hub keys for download flow and file selector (en + es) --- DashAI/front/src/api/hub.ts | 53 +++++++ .../src/components/hub/DatasetDetail.jsx | 133 ++++++++++++---- .../src/components/hub/HubImportPanel.jsx | 144 ++++++++++++++++-- .../front/src/components/hub/HubLeftBar.jsx | 88 ++++++++++- DashAI/front/src/pages/hub/HubContent.jsx | 136 ++++++++++++++++- .../front/src/utils/i18n/locales/en/hub.json | 16 +- .../front/src/utils/i18n/locales/es/hub.json | 16 +- 7 files changed, 534 insertions(+), 52 deletions(-) diff --git a/DashAI/front/src/api/hub.ts b/DashAI/front/src/api/hub.ts index fabf674a9..42326f47b 100644 --- a/DashAI/front/src/api/hub.ts +++ b/DashAI/front/src/api/hub.ts @@ -119,3 +119,56 @@ export const getComponentInfo = async ( ); return response.data; }; + +// ---- Hub Downloads ---- + +const hubDownloadEndpoint = "/v1/hub-download"; + +export type HubDownloadStatus = 
"downloading" | "ready" | "error"; + +export interface HubDownload { + id: number; + source_name: string; + dataset_id: string; + name: string; + local_path: string | null; + status: HubDownloadStatus; + error_message: string | null; + created: string | null; + last_modified: string | null; + job_id?: string; +} + +export const listHubDownloads = async (): Promise => { + const response = await api.get(`${hubDownloadEndpoint}/`); + return response.data; +}; + +export const getHubDownload = async (id: number): Promise => { + const response = await api.get(`${hubDownloadEndpoint}/${id}`); + return response.data; +}; + +export const createHubDownload = async ( + source_name: string, + dataset_id: string, + name: string, +): Promise => { + const response = await api.post(`${hubDownloadEndpoint}/`, { + source_name, + dataset_id, + name, + }); + return response.data; +}; + +export const deleteHubDownload = async (id: number): Promise => { + await api.delete(`${hubDownloadEndpoint}/${id}`); +}; + +export const listHubDownloadFiles = async (id: number): Promise => { + const response = await api.get( + `${hubDownloadEndpoint}/${id}/files`, + ); + return response.data; +}; diff --git a/DashAI/front/src/components/hub/DatasetDetail.jsx b/DashAI/front/src/components/hub/DatasetDetail.jsx index fd3e103d7..03a9e6763 100644 --- a/DashAI/front/src/components/hub/DatasetDetail.jsx +++ b/DashAI/front/src/components/hub/DatasetDetail.jsx @@ -3,26 +3,40 @@ import { Box, Button, Chip, + CircularProgress, Divider, Link, Stack, + Tooltip, Typography, } from "@mui/material"; import OpenInNewIcon from "@mui/icons-material/OpenInNew"; -import DownloadIcon from "@mui/icons-material/Download"; +import CloudDownloadIcon from "@mui/icons-material/CloudDownload"; +import CheckCircleIcon from "@mui/icons-material/CheckCircle"; +import ErrorIcon from "@mui/icons-material/Error"; import AddIcon from "@mui/icons-material/Add"; import { useTheme } from "@mui/material/styles"; import { useTranslation } 
from "react-i18next"; -import { getDatasetInfo, getDownloadUrl } from "../../api/hub"; +import { getDatasetInfo } from "../../api/hub"; /** * Right panel — detailed view of a selected Hub dataset with action buttons. * * @param {object|null} dataset - Selected DatasetEntry, or null if none. * @param {string|null} sourceName - Active DatasetSource class name. - * @param {function} onStartImport - Called when user clicks "Add to DashAI". + * @param {object|null} download - HubDownload record for this dataset (if any). + * @param {boolean} downloadLoading - True while the download record is being created. + * @param {function} onStartDownload - Called when user clicks "Download to DashAI". + * @param {function} onStartImport - Called when download is ready and user clicks "Add to DashAI". */ -export default function DatasetDetail({ dataset, sourceName, onStartImport }) { +export default function DatasetDetail({ + dataset, + sourceName, + download = null, + downloadLoading = false, + onStartDownload, + onStartImport, +}) { const { t } = useTranslation(["hub"]); const theme = useTheme(); const [extraInfo, setExtraInfo] = useState(null); @@ -57,13 +71,84 @@ export default function DatasetDetail({ dataset, sourceName, onStartImport }) { ); } - const handleDownload = async () => { - try { - const url = await getDownloadUrl(sourceName, dataset.id); - window.location.href = url; - } catch { - // silently fail — source page link is still available + const renderActionButton = () => { + if (downloadLoading) { + return ( + + ); + } + + if (!download) { + return ( + + ); + } + + if (download.status === "downloading") { + return ( + + ); + } + + if (download.status === "error") { + return ( + + + + + + + ); } + + // READY + return ( + + ); }; return ( @@ -87,23 +172,17 @@ export default function DatasetDetail({ dataset, sourceName, onStartImport }) { {dataset.name} - - - + + {renderActionButton()} + {download?.status === "ready" && ( + } + label={t("hub:downloaded")} + 
size="small" + color="success" + variant="outlined" + /> + )} { if (!dataset || !sourceName) return; setStepValue(0); @@ -67,8 +91,33 @@ export default function HubImportPanel({ setPreviewError(false); setColumnTypes({}); setColumnRenames({}); + setSelectedFile(null); + setFiles([]); }, [dataset?.id, sourceName, onSelectedLoaderChange, setStepValue]); + // Load files when entering file-select step + useEffect(() => { + if (!hasFileStep || stepValue !== 0 || !hubDownload) return; + let isMounted = true; + setLoadingFiles(true); + listHubDownloadFiles(hubDownload.id) + .then((f) => { + if (!isMounted) return; + setFiles(f); + if (f.length === 1) setSelectedFile(f[0]); + }) + .catch(() => { + if (isMounted) setFiles([]); + }) + .finally(() => { + if (isMounted) setLoadingFiles(false); + }); + return () => { + isMounted = false; + }; + }, [hasFileStep, stepValue, hubDownload?.id]); + + // Load dataloader metadata useEffect(() => { if (!compatibleComponents.length) { setDataloaders([]); @@ -100,13 +149,12 @@ export default function HubImportPanel({ }; }, [compatibleComponents]); + // Preview useEffect(() => { - if (stepValue !== 1 || !dataset || !sourceName) return; + if (stepValue !== previewStep || !dataset || !sourceName) return; let isMounted = true; - if (previewDebounceRef.current) { - clearTimeout(previewDebounceRef.current); - } + if (previewDebounceRef.current) clearTimeout(previewDebounceRef.current); setPreviewData(null); setPreviewLoading(true); @@ -118,7 +166,13 @@ export default function HubImportPanel({ : 100; previewDebounceRef.current = setTimeout(() => { - previewHubDataset(sourceName, dataset.id, effectiveRows, selectedValue?.name, formValues) + previewHubDataset( + sourceName, + dataset.id, + effectiveRows, + selectedValue?.name, + formValues, + ) .then((data) => { if (!isMounted) return; setPreviewData(data); @@ -134,12 +188,11 @@ export default function HubImportPanel({ return () => { isMounted = false; - if (previewDebounceRef.current) { - 
clearTimeout(previewDebounceRef.current); - } + if (previewDebounceRef.current) clearTimeout(previewDebounceRef.current); }; }, [ stepValue, + previewStep, dataset?.id, sourceName, selectedValue?.name, @@ -155,12 +208,17 @@ export default function HubImportPanel({ setImporting(true); try { const created = await createDataset(name.trim()); - await importHubDataset(sourceName, dataset.id, created.id, { + const importParams = { dataloader: selectedValue.name, dataloader_params: formValues, inferred_types: columnTypes, column_renames: columnRenames, - }); + }; + if (hubDownload) { + importParams.hub_download_id = hubDownload.id; + if (selectedFile) importParams.selected_file = selectedFile; + } + await importHubDataset(sourceName, dataset.id, created.id, importParams); enqueueSnackbar(t("hub:importSuccess"), { variant: "success" }); onImported?.(); } catch { @@ -170,7 +228,10 @@ export default function HubImportPanel({ } }; - const canProceed = !!selectedValue?.name; + const canProceedFromFile = + hasFileStep && stepValue === 0 ? !!selectedFile : true; + const canProceed = + stepValue === dataloaderStep ? !!selectedValue?.name : canProceedFromFile; const canImport = !!selectedValue?.name && !!name.trim() && @@ -180,6 +241,9 @@ export default function HubImportPanel({ !formHasErrors && !importing; + const handleBack = () => setStepValue((s) => s - 1); + const handleNext = () => setStepValue((s) => s + 1); + return ( - {stepValue === 0 && ( + {/* Step 0 (with hubDownload): file selector */} + {hasFileStep && stepValue === 0 && ( + + + + {t("hub:stepFileTitle")} + + + {t("hub:stepFileSubtitle")} + + + + {loadingFiles ? ( + + + + ) : files.length === 0 ? 
( + + {t("hub:noFilesFound")} + + ) : ( + + {files.map((f) => ( + setSelectedFile(f)} + sx={{ borderRadius: 1, mb: 0.5 }} + > + + + + ))} + + )} + + )} + + {/* Dataloader step */} + {stepValue === dataloaderStep && ( )} - {stepValue === 1 && ( + {/* Preview step */} + {stepValue === previewStep && ( @@ -329,15 +441,15 @@ export default function HubImportPanel({ {t("common:cancel")} ) : ( - )} - {stepValue === 0 ? ( + {stepValue < previewStep ? ( + + )} + { - getDatasetSources() - .then(setSources) - .catch(() => setSources([])) - .finally(() => setLoading(false)); - }, []); + const filteredDownloads = downloads.filter((dl) => + dl.name.toLowerCase().includes(searchQuery.toLowerCase()), + ); return ( - + {t("hub:title")} - - - - {/* Sources section */} - - {loading ? ( - - - - ) : ( - <> - setSourcesOpen((v) => !v)} - /> - - - {sources.length === 0 ? ( - - {t("common:noItemsAvailable", "No items available.")} - - ) : ( - sources.map((source) => ( - onSelectSource(source)} - > - - {source.display_name || source.name} - - - {source.description} - - - )) - )} - - - - )} - + + setSearchQuery(e.target.value)} + /> + - + - {/* Downloads section */} - + + setDownloadsOpen((v) => !v)} /> - {downloads.length === 0 ? ( + {filteredDownloads.length === 0 ? ( ) : ( - downloads.map((dl) => ( + filteredDownloads.map((dl) => ( HubDownload record const [downloads, setDownloads] = useState({}); const [downloadLoading, setDownloadLoading] = useState(false); - // download used in the current import flow (from left-bar "Add" button) const [importDownload, setImportDownload] = useState(null); const pollTimerRef = useRef(null); const sourceName = selectedSource?.name ?? 
null; - // Load all existing downloads on mount + useEffect(() => { + getDatasetSources() + .then(setSources) + .catch(() => setSources([])) + .finally(() => setSourcesLoading(false)); + }, []); + useEffect(() => { listHubDownloads() .then((rows) => { @@ -52,7 +71,6 @@ export default function HubContent() { .catch(() => {}); }, []); - // Poll in-progress downloads useEffect(() => { const inProgress = Object.values(downloads).filter( (d) => d.status === "downloading", @@ -174,13 +192,18 @@ export default function HubContent() { const downloadsList = Object.values(downloads); + const sourceOptions = sources.map((source) => ({ + name: source.name, + display_name: source.display_name || source.name, + description: source.description || "", + Icon: SOURCE_ICONS[source.name] ?? CloudDownloadIcon, + })); + return ( - ) : ( + ) : selectedSource ? ( setSelectedSource(null)} + /> + ) : ( + + handleSelectSource(sources.find((s) => s.name === name)) + } /> )} diff --git a/DashAI/front/src/utils/i18n/locales/en/hub.json b/DashAI/front/src/utils/i18n/locales/en/hub.json index 18803064a..ca9bcce63 100644 --- a/DashAI/front/src/utils/i18n/locales/en/hub.json +++ b/DashAI/front/src/utils/i18n/locales/en/hub.json @@ -50,5 +50,7 @@ "stepFileSubtitle": "Choose the file to import from the downloaded dataset.", "noFilesFound": "No files found in this download.", "sources": "Sources", - "fromSource": "From {{source}}" + "fromSource": "From {{source}}", + "selectSourceSubtitle": "Select a source to browse and import datasets.", + "searchDownloads": "Search downloads..." 
} diff --git a/DashAI/front/src/utils/i18n/locales/es/hub.json b/DashAI/front/src/utils/i18n/locales/es/hub.json index abe9af705..a72f4f8a4 100644 --- a/DashAI/front/src/utils/i18n/locales/es/hub.json +++ b/DashAI/front/src/utils/i18n/locales/es/hub.json @@ -50,5 +50,7 @@ "stepFileSubtitle": "Elige el archivo a importar del dataset descargado.", "noFilesFound": "No se encontraron archivos en esta descarga.", "sources": "Fuentes", - "fromSource": "De {{source}}" + "fromSource": "De {{source}}", + "selectSourceSubtitle": "Selecciona una fuente para explorar e importar datasets.", + "searchDownloads": "Buscar descargas..." } From 24219fdda86c601ad331a6f6a52890e2d87dc36f Mon Sep 17 00:00:00 2001 From: Irozuku Date: Fri, 8 May 2026 15:33:27 -0400 Subject: [PATCH 068/361] feat: add routes and breadcrumbs to hub source/dataset navigation - Add /app/hub/:sourceName route alongside /app/hub in App.jsx - HubContent: derive selectedSource from URL param via useParams; source card click navigates to /app/hub/:sourceName via useNavigate - New HubBreadcrumbs.jsx: shows "Dataset Hub > {source name}" with back arrow icon button, matching ModelsBreadcrumbs pattern - DatasetGrid: replace onBack Button with HubBreadcrumbs, swap onBack prop for sourceDisplayName --- DashAI/front/src/App.jsx | 1 + .../front/src/components/hub/DatasetGrid.jsx | 21 ++----- .../src/components/hub/HubBreadcrumbs.jsx | 48 ++++++++++++++ DashAI/front/src/pages/hub/HubContent.jsx | 62 +++++++++++-------- 4 files changed, 89 insertions(+), 43 deletions(-) create mode 100644 DashAI/front/src/components/hub/HubBreadcrumbs.jsx diff --git a/DashAI/front/src/App.jsx b/DashAI/front/src/App.jsx index c43937f95..77b441653 100644 --- a/DashAI/front/src/App.jsx +++ b/DashAI/front/src/App.jsx @@ -63,6 +63,7 @@ function App() { } /> + } /> diff --git a/DashAI/front/src/components/hub/DatasetGrid.jsx b/DashAI/front/src/components/hub/DatasetGrid.jsx index 239cdc739..a2472e89d 100644 --- 
a/DashAI/front/src/components/hub/DatasetGrid.jsx +++ b/DashAI/front/src/components/hub/DatasetGrid.jsx @@ -7,27 +7,27 @@ import { TextField, Typography, } from "@mui/material"; -import ArrowBackIcon from "@mui/icons-material/ArrowBack"; import SearchIcon from "@mui/icons-material/Search"; import { useTranslation } from "react-i18next"; import { searchDatasets } from "../../api/hub"; import DatasetCard from "./DatasetCard"; +import HubBreadcrumbs from "./HubBreadcrumbs"; const PAGE_SIZE = 20; /** - * Center panel — debounced search bar + paginated grid of DatasetCard components. + * Center panel — breadcrumbs, debounced search bar, and paginated grid of DatasetCard components. * * @param {string|null} sourceName - Active DatasetSource class name. + * @param {string} sourceDisplayName - Human-readable source name for breadcrumbs. * @param {object|null} selectedDataset - Currently selected DatasetEntry. * @param {function} onSelectDataset - Called with a DatasetEntry when a card is clicked. - * @param {function|null} onBack - If provided, shows a back button to return to source list. 
*/ export default function DatasetGrid({ sourceName, + sourceDisplayName, selectedDataset, onSelectDataset, - onBack = null, }) { const { t } = useTranslation(["hub", "common"]); const [query, setQuery] = useState(""); @@ -118,18 +118,7 @@ export default function DatasetGrid({ gap: 2, }} > - {onBack && ( - - - - )} + navigate("/app/hub"); + + return ( + + + + + + { e.preventDefault(); handleBack(); }} + sx={{ cursor: "pointer" }} + > + {t("hub:title")} + + {sourceDisplayName} + + + ); +} diff --git a/DashAI/front/src/pages/hub/HubContent.jsx b/DashAI/front/src/pages/hub/HubContent.jsx index b8e573e31..bd2684ed5 100644 --- a/DashAI/front/src/pages/hub/HubContent.jsx +++ b/DashAI/front/src/pages/hub/HubContent.jsx @@ -1,4 +1,5 @@ import { useCallback, useEffect, useRef, useState } from "react"; +import { useNavigate, useParams } from "react-router-dom"; import HubIcon from "@mui/icons-material/Hub"; import ScienceIcon from "@mui/icons-material/Science"; import CloudDownloadIcon from "@mui/icons-material/CloudDownload"; @@ -33,11 +34,12 @@ const SOURCE_ICONS = { export default function HubContent() { const { t } = useTranslation(["hub"]); + const navigate = useNavigate(); + const { sourceName: sourceNameParam } = useParams(); const threePanelLayout = useThreePanelLayout({ storageKey: "hub" }); const [sources, setSources] = useState([]); const [sourcesLoading, setSourcesLoading] = useState(true); - const [selectedSource, setSelectedSource] = useState(null); const [selectedDataset, setSelectedDataset] = useState(null); const [importMode, setImportMode] = useState(false); const [importStep, setImportStep] = useState(0); @@ -52,7 +54,10 @@ export default function HubContent() { const pollTimerRef = useRef(null); - const sourceName = selectedSource?.name ?? null; + // Derive selected source from URL param + loaded sources list + const selectedSource = sources.find((s) => s.name === sourceNameParam) ?? 
null; + const sourceDisplayName = + selectedSource?.display_name || selectedSource?.name || sourceNameParam; useEffect(() => { getDatasetSources() @@ -61,6 +66,17 @@ export default function HubContent() { .finally(() => setSourcesLoading(false)); }, []); + // Reset dataset selection when source changes + useEffect(() => { + setSelectedDataset(null); + setImportMode(false); + setImportStep(0); + setSelectedDataloader(null); + setFormValues({}); + setFormHasErrors(false); + setImportDownload(null); + }, [sourceNameParam]); + useEffect(() => { listHubDownloads() .then((rows) => { @@ -101,24 +117,24 @@ export default function HubContent() { const getDownloadForDataset = useCallback( (ds) => { - if (!ds || !sourceName) return null; - return downloads[`${sourceName}::${ds.id}`] ?? null; + if (!ds || !sourceNameParam) return null; + return downloads[`${sourceNameParam}::${ds.id}`] ?? null; }, - [downloads, sourceName], + [downloads, sourceNameParam], ); const handleStartDownload = async () => { - if (!selectedDataset || !sourceName) return; + if (!selectedDataset || !sourceNameParam) return; setDownloadLoading(true); try { const row = await enqueueHubDownloadJob( - sourceName, + sourceNameParam, selectedDataset.id, selectedDataset.name, ); setDownloads((prev) => ({ ...prev, - [`${sourceName}::${selectedDataset.id}`]: row, + [`${sourceNameParam}::${selectedDataset.id}`]: row, })); } catch { // error shown via download status @@ -152,14 +168,7 @@ export default function HubContent() { }; const handleSelectSource = (source) => { - setSelectedSource(source); - setSelectedDataset(null); - setImportMode(false); - setImportStep(0); - setSelectedDataloader(null); - setFormValues({}); - setFormHasErrors(false); - setImportDownload(null); + navigate(`/app/hub/${source.name}`); }; const handleImported = () => { @@ -199,6 +208,10 @@ export default function HubContent() { Icon: SOURCE_ICONS[source.name] ?? CloudDownloadIcon, })); + const importSourceName = importDownload + ? 
importDownload.source_name + : sourceNameParam; + return ( @@ -215,15 +228,10 @@ export default function HubContent() { - ) : selectedSource ? ( + ) : sourceNameParam ? ( setSelectedSource(null)} /> ) : ( Date: Fri, 8 May 2026 15:37:28 -0400 Subject: [PATCH 069/361] feat: add title-divider-content layout to DatasetDetail right bar --- .../src/components/hub/DatasetDetail.jsx | 196 ++++++++++-------- .../front/src/utils/i18n/locales/en/hub.json | 3 +- .../front/src/utils/i18n/locales/es/hub.json | 3 +- 3 files changed, 110 insertions(+), 92 deletions(-) diff --git a/DashAI/front/src/components/hub/DatasetDetail.jsx b/DashAI/front/src/components/hub/DatasetDetail.jsx index 03a9e6763..8b1007bd2 100644 --- a/DashAI/front/src/components/hub/DatasetDetail.jsx +++ b/DashAI/front/src/components/hub/DatasetDetail.jsx @@ -18,6 +18,7 @@ import AddIcon from "@mui/icons-material/Add"; import { useTheme } from "@mui/material/styles"; import { useTranslation } from "react-i18next"; import { getDatasetInfo } from "../../api/hub"; +import SideBar from "../threeSectionLayout/panelContainers/SideBar"; /** * Right panel — detailed view of a selected Hub dataset with action buttons. @@ -52,25 +53,6 @@ export default function DatasetDetail({ .catch(() => setExtraInfo({})); }, [dataset?.id, sourceName]); - if (!dataset) { - return ( - - - {t("hub:selectDatasetToPreview")} - - - ); - } - const renderActionButton = () => { if (downloadLoading) { return ( @@ -152,90 +134,124 @@ export default function DatasetDetail({ }; return ( - + + {/* Title */} - - {dataset.name} + + {t("hub:datasetDetails")} - - - {renderActionButton()} - {download?.status === "ready" && ( - } - label={t("hub:downloaded")} - size="small" - color="success" - variant="outlined" - /> - )} - - - - {t("hub:viewOnSource")} - - - {(extraInfo?.description || dataset.description) && ( - - {extraInfo?.description || dataset.description} + {/* Content */} + {!dataset ? 
( + + + {t("hub:selectDatasetToPreview")} - )} + + ) : ( + + + + {dataset.name} + - + + {renderActionButton()} + {download?.status === "ready" && ( + } + label={t("hub:downloaded")} + size="small" + color="success" + variant="outlined" + /> + )} + - - {dataset.row_count != null && ( - - - {t("hub:rows")} - - - {dataset.row_count.toLocaleString()} - - - )} + + {t("hub:viewOnSource")} + + - {(extraInfo?.tags ?? dataset.tags)?.length > 0 && ( - - - {t("hub:tags")} + + {(extraInfo?.description || dataset.description) && ( + + {extraInfo?.description || dataset.description} - - {(extraInfo?.tags ?? dataset.tags).map((tag) => ( - - ))} - - - )} - - - + )} + + + + + {dataset.row_count != null && ( + + + {t("hub:rows")} + + + {dataset.row_count.toLocaleString()} + + + )} + + {(extraInfo?.tags ?? dataset.tags)?.length > 0 && ( + + + {t("hub:tags")} + + + {(extraInfo?.tags ?? dataset.tags).map((tag) => ( + + ))} + + + )} + + + + )} + ); } diff --git a/DashAI/front/src/utils/i18n/locales/en/hub.json b/DashAI/front/src/utils/i18n/locales/en/hub.json index ca9bcce63..6bec460c2 100644 --- a/DashAI/front/src/utils/i18n/locales/en/hub.json +++ b/DashAI/front/src/utils/i18n/locales/en/hub.json @@ -52,5 +52,6 @@ "sources": "Sources", "fromSource": "From {{source}}", "selectSourceSubtitle": "Select a source to browse and import datasets.", - "searchDownloads": "Search downloads..." + "searchDownloads": "Search downloads...", + "datasetDetails": "Dataset Details" } diff --git a/DashAI/front/src/utils/i18n/locales/es/hub.json b/DashAI/front/src/utils/i18n/locales/es/hub.json index a72f4f8a4..1725471fb 100644 --- a/DashAI/front/src/utils/i18n/locales/es/hub.json +++ b/DashAI/front/src/utils/i18n/locales/es/hub.json @@ -52,5 +52,6 @@ "sources": "Fuentes", "fromSource": "De {{source}}", "selectSourceSubtitle": "Selecciona una fuente para explorar e importar datasets.", - "searchDownloads": "Buscar descargas..." 
+ "searchDownloads": "Buscar descargas...", + "datasetDetails": "Detalles del Dataset" } From 0bd08b93f03e0ae0257b138f222f4de712acb809 Mon Sep 17 00:00:00 2001 From: Felipe Date: Sun, 10 May 2026 11:52:45 -0400 Subject: [PATCH 070/361] feat: update spacing and padding across various components for improved layout - Increased padding and margin values in NewItemButton, OptionBox, SearchBar, and SelectOptionMenu for better visual balance. - Adjusted spacing in CustomTooltip and SidebarSection to enhance user experience. - Modified padding in Home, NewPipeline, and PluginsCard to create a more spacious layout. - Updated grid spacing in PluginsContent, PluginsDetails, and PluginsTab for consistency. - Enhanced layout in LiveMetricsChart, MetricsCard, and ResultsDetailsLayout for improved readability. - Increased spacing in ResultsTabInfo and ResultsTabMetrics for better separation of content. - Introduced a new spacing scale in theme.js to standardize layout adjustments across the application. 
--- .../src/components/DatasetVisualization.jsx | 18 ++--- DashAI/front/src/components/HomeButton.jsx | 2 +- .../front/src/components/LanguageSelector.jsx | 2 +- .../front/src/components/ResponsiveAppBar.jsx | 16 ++--- .../configurableObject/Inputs/ClassInput.jsx | 4 +- .../Inputs/FormInputWrapper.jsx | 4 +- .../Inputs/IntegerInputOptimize.jsx | 6 +- .../Inputs/NumberInputOptimize.jsx | 6 +- .../configurableObject/SplitsParams.jsx | 2 +- .../custom/ComponentDetailsPanel.jsx | 20 +++--- .../components/custom/ComponentSelector.jsx | 32 ++++----- .../src/components/custom/CustomLayout.jsx | 6 +- .../datasets/ConfigureAndUploadDataset.jsx | 4 +- .../datasets/ConvertDatasetModal.jsx | 10 +-- .../components/datasets/ConverterTable.jsx | 2 +- .../datasets/DataloaderConfiguration.jsx | 4 +- .../src/components/datasets/DatasetModal.jsx | 4 +- .../datasets/DatasetPreviewStep.jsx | 4 +- .../datasets/DatasetSummaryModal.jsx | 2 +- .../datasets/DatasetSummaryStep.jsx | 4 +- .../components/datasets/EditDataTypeModal.jsx | 4 +- .../components/datasets/EditDatasetModal.jsx | 6 +- .../datasets/SelectDataloaderStep.jsx | 6 +- .../front/src/components/datasets/Upload.jsx | 6 +- .../ConverterClassColumnModal.jsx | 12 ++-- .../converterModals/ConverterScopeModal.jsx | 26 ++++---- .../explainers/ConfigureExplainerStep.jsx | 6 +- .../explainers/ExplainerConfiguration.jsx | 4 +- .../components/explainers/ExplainersCard.jsx | 16 ++--- .../components/explainers/ExplainersPlot.jsx | 4 +- .../explainers/InlineExplainerCreator.jsx | 8 +-- .../explainers/ParameterListItem.jsx | 4 +- .../explainers/SelectDatasetStep.jsx | 2 +- .../explainers/SetNameAndExplainerStep.jsx | 6 +- .../components/explainers/SplitSelector.jsx | 14 ++-- .../explorations/ExplorationEditor.jsx | 2 +- .../explorations/ExplorationRunner.jsx | 6 +- .../components/explorations/Explorations.jsx | 8 +-- .../explorations/ExplorationsTable.jsx | 4 +- .../components/explorations/ResultsAll.jsx | 6 +- 
.../explorations/ResultsByExplorer.jsx | 6 +- .../components/explorations/ResultsViewer.jsx | 6 +- .../Steps/ConfigureExplorersStep.jsx | 4 +- .../Visualizations/PlotlyJsonVisualizer.jsx | 4 +- .../explorers/DetailTabs/Info.jsx | 10 +-- .../explorers/EditColumnsDialog.jsx | 16 ++--- .../explorers/ExplorerDetails.jsx | 4 +- .../explorations/explorers/ExplorersTable.jsx | 6 +- .../components/generative/AudioMessage.jsx | 2 +- .../src/components/generative/ChatBubble.jsx | 6 +- .../components/generative/ChatTimeStamp.jsx | 4 +- .../generative/CreateSessionCenter.jsx | 14 ++-- .../generative/CreateSessionRight.jsx | 8 +-- .../generative/GenerativeBreadcrumbs.jsx | 4 +- .../components/generative/GenerativeChat.jsx | 14 ++-- .../generative/InfoSessionModal.jsx | 16 ++--- .../src/components/generative/MediaInput.jsx | 4 +- .../components/generative/MessageContent.jsx | 2 +- .../src/components/generative/ParamsBar.jsx | 12 ++-- .../src/components/generative/SessionBar.jsx | 4 +- .../src/components/generative/SessionBox.jsx | 4 +- .../generative/SessionHistoryModal.jsx | 24 +++---- .../components/generative/VideoMessage.jsx | 2 +- .../mediaInput/MediaAttachPopper.jsx | 2 +- .../mediaInput/MediaOnlyPlaceholder.jsx | 8 +-- .../mediaInput/MediaPreviewList.jsx | 4 +- .../generative/textMessage/CodeBlock.jsx | 6 +- .../generative/textMessage/InlineCode.jsx | 4 +- .../hardware/HardwareMonitorPanel.jsx | 20 +++--- .../src/components/jobs/JobDetailsDialog.jsx | 8 +-- .../src/components/jobs/JobQueueWidget.jsx | 12 ++-- .../src/components/layout/ModuleContainer.jsx | 4 +- .../src/components/models/AddModelDialog.jsx | 16 ++--- .../components/models/CreateSessionSteps.jsx | 14 ++-- .../models/DatasetPredictionPanel.jsx | 4 +- .../components/models/HyperparameterPlots.jsx | 4 +- .../components/models/InfoSessionModal.jsx | 10 +-- .../components/models/LiveMetricsChart.jsx | 2 +- .../models/ManualPredictionPanel.jsx | 12 ++-- .../components/models/ModelCenterContent.jsx | 4 +- 
.../models/ModelComparisonTable.jsx | 18 ++--- .../src/components/models/ModelParamBlock.jsx | 8 +-- .../components/models/ModelsBreadcrumbs.jsx | 4 +- .../src/components/models/ModelsLeftBar.jsx | 4 +- .../src/components/models/ModelsRightBar.jsx | 6 +- .../src/components/models/PredictionCard.jsx | 18 ++--- .../models/RetrainConfirmDialog.jsx | 6 +- .../front/src/components/models/RunCard.jsx | 42 ++++++------ .../src/components/models/RunResults.jsx | 66 +++++++++---------- .../models/SessionVisualization.jsx | 18 ++--- .../models/SetNameAndDatasetStep.jsx | 2 +- .../src/components/models/TaskSelector.jsx | 4 +- .../models/model/HoverModelInfo.jsx | 6 +- .../components/models/model/ModelListItem.jsx | 4 +- .../modelSession/ConfigureModelsStep.jsx | 6 +- .../modelSession/DivideDatasetColumns.jsx | 10 +-- .../HyperparameterOptimizationStep.jsx | 2 +- .../models/modelSession/ModelsTable.jsx | 6 +- .../models/modelSession/OptimizationTable.jsx | 6 +- .../modelSession/PrepareDatasetStep.jsx | 16 ++--- .../models/modelSession/RunnerDialog.jsx | 8 +-- .../models/modelSession/SelectDatasetStep.jsx | 4 +- .../modelSession/SetNameAndTaskStep.jsx | 6 +- .../models/modelSession/SplitDatasetRows.jsx | 24 +++---- .../modelSession/metrics/MetricCard.jsx | 6 +- .../modelSession/metrics/MetricsSelector.jsx | 10 +-- .../modelSession/metrics/SplitColumn.jsx | 2 +- .../modelSession/runButtons/RunInfoModal.jsx | 26 ++++---- .../components/notebooks/ColumnSelector.jsx | 10 +-- .../components/notebooks/DataBreadcrumbs.jsx | 4 +- .../notebooks/DatasetNotebookLeftBar.jsx | 4 +- .../components/notebooks/DescriptionPanel.jsx | 2 +- .../src/components/notebooks/NoteBox.jsx | 4 +- .../src/components/notebooks/RightBar.jsx | 30 ++++----- .../src/components/notebooks/RowSelector.jsx | 16 ++--- .../notebooks/converter/ConverterBox.jsx | 12 ++-- .../converter/ConverterHistoryList.jsx | 4 +- .../notebooks/converter/ItemsToDeleteList.jsx | 8 +-- .../ConverterTargetColumnModal.jsx | 14 ++-- 
.../ParameterStepConverter.jsx | 2 +- .../converterCreation/ScopeStepConverter.jsx | 10 +-- .../notebooks/dataset/ColumnInsights.jsx | 20 +++--- .../notebooks/dataset/DatasetSummaryTable.jsx | 2 +- .../dataset/DatasetsCenterContent.jsx | 8 +-- .../dataset/EditableColumnHeader.jsx | 6 +- .../dataset/InferenceReasonPopover.jsx | 12 ++-- .../notebooks/dataset/IssueCard.jsx | 6 +- .../dataset/MrtDatasetTableInfScroll.jsx | 2 +- .../notebooks/dataset/QualityAlerts.jsx | 10 +-- .../components/notebooks/dataset/StatBox.jsx | 2 +- .../notebooks/dataset/header/Header.jsx | 4 +- .../notebooks/dataset/header/HeaderBox.jsx | 6 +- .../notebooks/dataset/tabs/CategoricalTab.jsx | 10 +-- .../dataset/tabs/CorrelationsTab.jsx | 12 ++-- .../notebooks/dataset/tabs/NumericTab.jsx | 20 +++--- .../notebooks/dataset/tabs/OverviewTab.jsx | 8 +-- .../notebooks/dataset/tabs/QualityTab.jsx | 24 +++---- .../notebooks/dataset/tabs/TextTab.jsx | 22 +++---- .../ConfigureAndUploadDatasetStep.jsx | 10 +-- .../datasetCreation/DataloaderConfigBar.jsx | 10 +-- .../DataloaderConfiguration.jsx | 2 +- .../datasetCreation/PreviewDataset.jsx | 10 +-- .../datasetCreation/PreviewDatasetTable.jsx | 8 +-- .../datasetCreation/SaveDatasetModal.jsx | 4 +- .../datasetCreation/SelectDataloaderStep.jsx | 6 +- .../datasetCreation/TypeChangeValidator.jsx | 8 +-- .../notebooks/datasetCreation/Upload.jsx | 6 +- .../datasetCreation/UploadDatasetSteps.jsx | 2 +- .../notebooks/explorer/ExplorerBox.jsx | 10 +-- .../explorer/ExplorerDetailsModal.jsx | 16 ++--- .../plotLayout/ColorscaleSelector.jsx | 16 ++--- .../plotLayout/DebouncedColorPicker.jsx | 2 +- .../explorer/plotLayout/PlotLayoutForm.jsx | 12 ++-- .../plotLayout/forms/DimensionsForm.jsx | 6 +- .../explorer/plotLayout/forms/GeneralForm.jsx | 8 +-- .../explorer/plotLayout/forms/LegendForm.jsx | 4 +- .../explorer/plotLayout/forms/TraceForm.jsx | 2 +- .../explorer/plotLayout/forms/XAxisForm.jsx | 2 +- .../explorer/plotLayout/forms/YAxisForm.jsx | 2 +- 
.../notebooks/explorer/tabs/Columns.jsx | 6 +- .../notebooks/explorer/tabs/Info.jsx | 10 +-- .../notebooks/explorer/tabs/Parameters.jsx | 6 +- .../visualizations/PlotlyJsonVisualizer.jsx | 2 +- .../ParameterStepExplorer.jsx | 2 +- .../explorerCreation/ScopeStepExplorer.jsx | 6 +- .../notebook/DatasetPreviewNotebook.jsx | 8 +-- .../notebooks/notebook/InfoNotebookModal.jsx | 12 ++-- .../notebook/NotebookHistoryModal.jsx | 4 +- .../notebooks/notebook/NotebookView.jsx | 2 +- .../notebook/NotebookVisualization.jsx | 2 +- .../notebookCreation/DatasetAutocomplete.jsx | 14 ++-- .../notebookCreation/UploadNotebookSteps.jsx | 12 ++-- .../notebooks/tool/ConfigureToolModal.jsx | 20 +++--- .../notebooks/tool/HoverToolInfo.jsx | 8 +-- .../components/notebooks/tool/ToolGrid.jsx | 12 ++-- .../notebooks/tool/ToolGridItem.jsx | 8 +-- .../components/notebooks/tool/ToolList.jsx | 16 ++--- .../notebooks/tool/ToolListItem.jsx | 12 ++-- .../src/components/pipelines/CustomNode.jsx | 2 +- .../src/components/pipelines/NodeSidebar.jsx | 16 ++--- .../components/pipelines/ParamsSettings.jsx | 4 +- .../components/pipelines/PipelineHeader.jsx | 4 +- .../components/pipelines/PipelineToolbar.jsx | 4 +- .../components/pipelines/PipelinesTable.jsx | 4 +- .../pipelines/nodes/ConfigurableNode.jsx | 2 +- .../pipelines/nodes/DataSelectorNode.jsx | 4 +- .../pipelines/nodes/ExplorationModal.jsx | 2 +- .../pipelines/nodes/RetrieveModelNode.jsx | 4 +- .../components/pipelines/nodes/TrainNode.jsx | 10 +-- .../components/pipelines/results/Results.jsx | 4 +- .../pipelines/results/ResultsExploration.jsx | 2 +- .../pipelines/results/ResultsGraphs.jsx | 2 +- .../pipelines/results/ResultsMetrics.jsx | 4 +- .../results/ResultsPredictionModal.jsx | 2 +- .../predictions/DatasetSelector.jsx | 12 ++-- .../src/components/predictions/InputField.jsx | 2 +- .../predictions/ManualInputForm.jsx | 6 +- .../components/predictions/ModeSelector.jsx | 16 ++--- .../predictions/PredictionModal.jsx | 4 +- 
.../predictions/PredictionsTable.jsx | 8 +-- .../components/predictions/ResultsTable.jsx | 8 +-- .../src/components/shared/BoxWithTitle.jsx | 2 +- .../shared/FormSchemaButtonGroup.jsx | 6 +- .../components/shared/FormSchemaFieldCard.jsx | 38 +++++------ .../shared/FormSchemaFieldWithOptimizers.jsx | 2 +- .../components/shared/FormSchemaHeader.jsx | 2 +- .../components/shared/FormSchemaLayout.jsx | 2 +- .../components/shared/FormSchemaOptimizer.jsx | 4 +- .../shared/FormSchemaRenderFields.jsx | 2 +- .../shared/FormSchemaWithSelectedModel.jsx | 2 +- .../components/shared/NestedListDisplayer.jsx | 6 +- .../shared/SingleSelectChipGroup.jsx | 2 +- .../src/components/shared/TextWithOptions.jsx | 2 +- .../threeSectionLayout/CollapsibleList.jsx | 20 +++--- .../DeleteConfirmationModal.jsx | 2 +- .../GroupedCollapsibleList.jsx | 30 ++++----- .../components/threeSectionLayout/ItemBox.jsx | 2 +- .../threeSectionLayout/ItemMenu.jsx | 2 +- .../threeSectionLayout/NewItemButton.jsx | 2 +- .../threeSectionLayout/OptionBox.jsx | 2 +- .../threeSectionLayout/SearchBar.jsx | 2 +- .../threeSectionLayout/SelectOptionMenu.jsx | 8 +-- .../src/components/tour/CustomTooltip.jsx | 6 +- DashAI/front/src/pages/home/Home.jsx | 30 ++++----- .../front/src/pages/pipelines/NewPipeline.jsx | 4 +- .../pages/plugins/components/PluginsCard.jsx | 14 ++-- .../plugins/components/PluginsContent.jsx | 2 +- .../plugins/components/PluginsDetails.jsx | 8 +-- .../pages/plugins/components/PluginsTab.jsx | 2 +- .../pages/plugins/components/PluginsTags.jsx | 2 +- .../plugins/components/PluginsToolbar.jsx | 6 +- .../results/components/LiveMetricsChart.jsx | 8 +-- .../pages/results/components/MetricsCard.jsx | 6 +- .../components/ResultsDetailsLayout.jsx | 4 +- .../results/components/ResultsDialogViews.jsx | 6 +- .../results/components/ResultsGraphs.jsx | 2 +- .../components/ResultsGraphsParameters.jsx | 10 +-- .../results/components/ResultsGraphsPlot.jsx | 2 +- .../components/ResultsGraphsSelection.jsx | 6 +- 
.../components/ResultsGraphsSwitch.jsx | 2 +- .../components/ResultsTabHyperparameters.jsx | 2 +- .../results/components/ResultsTabInfo.jsx | 36 +++++----- .../results/components/ResultsTabInfoDate.jsx | 4 +- .../results/components/ResultsTabInfoName.jsx | 4 +- .../results/components/ResultsTabMetrics.jsx | 2 +- .../components/ResultsTabMetricsRuns.jsx | 10 +-- .../ResultsTabParametersDisplayList.jsx | 4 +- .../results/components/ResultsTableLayout.jsx | 4 +- DashAI/front/src/styles/theme.js | 20 ++++++ 249 files changed, 1019 insertions(+), 1005 deletions(-) diff --git a/DashAI/front/src/components/DatasetVisualization.jsx b/DashAI/front/src/components/DatasetVisualization.jsx index 8d9c967c6..be0f80a4d 100644 --- a/DashAI/front/src/components/DatasetVisualization.jsx +++ b/DashAI/front/src/components/DatasetVisualization.jsx @@ -187,11 +187,11 @@ export default function DatasetVisualization({ return ( <> {/* Quick Stats Section */} {!isProcessing && datasetInfo && ( - + {/* Dataset quality score */} - + {dataset.name} - + @@ -268,7 +268,7 @@ export default function DatasetVisualization({ sx={{ display: "flex", alignItems: "flex-end", - gap: 2, + gap: 4, flexDirection: "column", }} > @@ -276,7 +276,7 @@ export default function DatasetVisualization({ sx={{ minHeight: "40px", display: "flex", - gap: 2, + gap: 4, flexWrap: "wrap", justifyContent: "flex-start", }} @@ -421,7 +421,7 @@ export default function DatasetVisualization({ {/* Divider */} - + {/* Content based on selected tab */} {tab === 0 && ( @@ -465,7 +465,7 @@ export default function DatasetVisualization({ alignItems: "center", minHeight: 200, flexDirection: "column", - gap: 2, + gap: 4, }} > diff --git a/DashAI/front/src/components/HomeButton.jsx b/DashAI/front/src/components/HomeButton.jsx index a20e228f7..97331cc63 100644 --- a/DashAI/front/src/components/HomeButton.jsx +++ b/DashAI/front/src/components/HomeButton.jsx @@ -163,7 +163,7 @@ function HomeButton({ color: theme.palette.text.disabled, 
transition: "color 0.15s, transform 0.15s", flexShrink: 0, - ml: 1, + ml: 2, }} > → diff --git a/DashAI/front/src/components/LanguageSelector.jsx b/DashAI/front/src/components/LanguageSelector.jsx index a5f40545f..c7f7c83b6 100644 --- a/DashAI/front/src/components/LanguageSelector.jsx +++ b/DashAI/front/src/components/LanguageSelector.jsx @@ -23,7 +23,7 @@ export default function LanguageSelector() { "& .MuiSelect-select": { display: "flex", alignItems: "center", - gap: 1, + gap: 2, }, }} > diff --git a/DashAI/front/src/components/ResponsiveAppBar.jsx b/DashAI/front/src/components/ResponsiveAppBar.jsx index 6ffd22db0..5890cdb63 100644 --- a/DashAI/front/src/components/ResponsiveAppBar.jsx +++ b/DashAI/front/src/components/ResponsiveAppBar.jsx @@ -71,7 +71,7 @@ function ResponsiveAppBar() { }} > {/* Logo */} @@ -108,8 +108,8 @@ function ResponsiveAppBar() { border: `1px solid ${theme.palette.accent.amberBorder}`, background: theme.palette.accent.amberDim, borderRadius: "2px", - px: "7px", - py: "2px", + px: 2, + py: 1, lineHeight: 1.4, }} > @@ -181,7 +181,7 @@ function ResponsiveAppBar() { component={RouterLink} to="/app" disableRipple - sx={{ ...iconBtnSx, mr: 1, alignSelf: "center" }} + sx={{ ...iconBtnSx, mr: 2, alignSelf: "center" }} aria-label="home" > @@ -197,7 +197,7 @@ function ResponsiveAppBar() { sx={{ display: "flex", alignItems: "center", - px: "18px", + px: 4, height: "100%", textDecoration: "none", ...theme.typography.tabLabel, @@ -232,7 +232,7 @@ function ResponsiveAppBar() { sx={{ display: "flex", alignItems: "center", - gap: { xs: 0.5, sm: 1 }, + gap: { xs: 1, sm: 2 }, flexShrink: 0, }} > diff --git a/DashAI/front/src/components/configurableObject/Inputs/ClassInput.jsx b/DashAI/front/src/components/configurableObject/Inputs/ClassInput.jsx index 89c69c40c..1e696b57d 100644 --- a/DashAI/front/src/components/configurableObject/Inputs/ClassInput.jsx +++ b/DashAI/front/src/components/configurableObject/Inputs/ClassInput.jsx @@ -146,8 +146,8 @@ function 
ClassInput({ {/* Option 1: Collapsible component that contains the subform */} - - + + {!loading && ( + {children} - + diff --git a/DashAI/front/src/components/configurableObject/Inputs/IntegerInputOptimize.jsx b/DashAI/front/src/components/configurableObject/Inputs/IntegerInputOptimize.jsx index 0034b4d7d..d6275dc7d 100644 --- a/DashAI/front/src/components/configurableObject/Inputs/IntegerInputOptimize.jsx +++ b/DashAI/front/src/components/configurableObject/Inputs/IntegerInputOptimize.jsx @@ -116,12 +116,12 @@ function OptimizeIntegerInput({ )} {canOptimize && switchState ? ( - + {t("lowerBound")} @@ -142,7 +142,7 @@ function OptimizeIntegerInput({ {t("upperBound")} diff --git a/DashAI/front/src/components/configurableObject/Inputs/NumberInputOptimize.jsx b/DashAI/front/src/components/configurableObject/Inputs/NumberInputOptimize.jsx index 018eee13c..23651205b 100644 --- a/DashAI/front/src/components/configurableObject/Inputs/NumberInputOptimize.jsx +++ b/DashAI/front/src/components/configurableObject/Inputs/NumberInputOptimize.jsx @@ -116,12 +116,12 @@ function OptimizeNumberInput({ )} {canOptimize && switchState ? ( - + {t("lowerBound")} @@ -141,7 +141,7 @@ function OptimizeNumberInput({ {t("upperBound")} diff --git a/DashAI/front/src/components/configurableObject/SplitsParams.jsx b/DashAI/front/src/components/configurableObject/SplitsParams.jsx index 1cbbd76db..a8fc4dfcd 100644 --- a/DashAI/front/src/components/configurableObject/SplitsParams.jsx +++ b/DashAI/front/src/components/configurableObject/SplitsParams.jsx @@ -63,7 +63,7 @@ function SplitsParams({ {/* splits configuration form that can be hidden or shown as needed. 
*/} - + Splits configuration {showSplitsError && ( diff --git a/DashAI/front/src/components/custom/ComponentDetailsPanel.jsx b/DashAI/front/src/components/custom/ComponentDetailsPanel.jsx index f2ef101da..41d4161f6 100644 --- a/DashAI/front/src/components/custom/ComponentDetailsPanel.jsx +++ b/DashAI/front/src/components/custom/ComponentDetailsPanel.jsx @@ -82,7 +82,7 @@ function ComponentDetailsPanel({ {/* Title */} ) : ( - - + + {getIcon?.(component) && ( {t("description")} - + {getDescription(component) ? ( ) : ( @@ -174,10 +174,10 @@ function ComponentDetailsPanel({ {(component.schema?.tags || component.metadata?.tags).map( (tag) => ( @@ -196,7 +196,7 @@ function ComponentDetailsPanel({ {extraSections && extraSections.map((section) => ( - + {section.title} - {section.content} + {section.content} ))} diff --git a/DashAI/front/src/components/custom/ComponentSelector.jsx b/DashAI/front/src/components/custom/ComponentSelector.jsx index e2b2ad2bb..a5b18b359 100644 --- a/DashAI/front/src/components/custom/ComponentSelector.jsx +++ b/DashAI/front/src/components/custom/ComponentSelector.jsx @@ -101,7 +101,7 @@ function ComponentSelector({ const handleSelect = (component) => onSelect?.(component); return ( - + - + {categories.map((cat) => { const isActive = activeCategory === cat; const count = cat === ALL_CATEGORY ? 
components.length : counts[cat]; @@ -144,8 +144,8 @@ function ComponentSelector({ })} - - + + {Object.entries(grouped).map(([cat, items]) => { const isOpen = expanded.has(cat); return ( @@ -156,13 +156,13 @@ function ComponentSelector({ display: "flex", alignItems: "center", justifyContent: "space-between", - py: 1, + py: 2, cursor: "pointer", color: "text.secondary", "&:hover": { color: "text.primary" }, }} > - + {items.map((component) => { @@ -201,9 +201,9 @@ function ComponentSelector({ elevation={0} onClick={() => handleSelect(component)} sx={{ - p: 1.5, + p: 3, display: "flex", - gap: 1.5, + gap: 3, alignItems: "flex-start", cursor: "pointer", border: 1, @@ -222,7 +222,7 @@ function ComponentSelector({ {icon && ( {getDescription( @@ -268,7 +268,7 @@ function ComponentSelector({ )} @@ -284,11 +284,11 @@ function ComponentSelector({ - + {emptyText ?? t("noItemsFound")} @@ -302,7 +302,7 @@ function ComponentSelector({ - + {title && ( - + {title} )} {subtitle && ( - + {subtitle} )} diff --git a/DashAI/front/src/components/datasets/ConfigureAndUploadDataset.jsx b/DashAI/front/src/components/datasets/ConfigureAndUploadDataset.jsx index f8bef7796..9a4a27d3e 100644 --- a/DashAI/front/src/components/datasets/ConfigureAndUploadDataset.jsx +++ b/DashAI/front/src/components/datasets/ConfigureAndUploadDataset.jsx @@ -67,13 +67,13 @@ function ConfigureAndUploadDataset({ }, [error, newDataset.file]); return ( - + {/* Upload file */} diff --git a/DashAI/front/src/components/datasets/ConvertDatasetModal.jsx b/DashAI/front/src/components/datasets/ConvertDatasetModal.jsx index 97b73c935..df556d9c7 100644 --- a/DashAI/front/src/components/datasets/ConvertDatasetModal.jsx +++ b/DashAI/front/src/components/datasets/ConvertDatasetModal.jsx @@ -200,12 +200,12 @@ function ConvertDatasetModal({ datasetId }) { direction="row" justifyContent="space-around" alignItems="stretch" - rowGap={2} + rowGap={4} onClick={(event) => event.stopPropagation()} > {/* Dataset summary table */} - + 
Dataset summary @@ -216,10 +216,10 @@ function ConvertDatasetModal({ datasetId }) { size={{ xs: 12 }} display={"flex"} alignItems={"center"} - gap={2} + gap={4} > - + List of converters - + Class/Target column index , ].filter(Boolean); - return {actions}; + return {actions}; }, }, ], diff --git a/DashAI/front/src/components/datasets/DataloaderConfiguration.jsx b/DashAI/front/src/components/datasets/DataloaderConfiguration.jsx index 496f6643a..0fa6ebfb9 100644 --- a/DashAI/front/src/components/datasets/DataloaderConfiguration.jsx +++ b/DashAI/front/src/components/datasets/DataloaderConfiguration.jsx @@ -24,8 +24,8 @@ function DataloaderConfiguration({ }; return ( - - + + {/* Form title */} diff --git a/DashAI/front/src/components/datasets/DatasetModal.jsx b/DashAI/front/src/components/datasets/DatasetModal.jsx index 50478c287..a886a3202 100644 --- a/DashAI/front/src/components/datasets/DatasetModal.jsx +++ b/DashAI/front/src/components/datasets/DatasetModal.jsx @@ -246,12 +246,12 @@ function DatasetModal({ open, setOpen, updateDatasets }) { > {/* Title */} - + New dataset diff --git a/DashAI/front/src/components/datasets/DatasetPreviewStep.jsx b/DashAI/front/src/components/datasets/DatasetPreviewStep.jsx index aab7ff042..ee2a41d66 100644 --- a/DashAI/front/src/components/datasets/DatasetPreviewStep.jsx +++ b/DashAI/front/src/components/datasets/DatasetPreviewStep.jsx @@ -19,7 +19,7 @@ function DatasetPreviewStep({ return ( @@ -28,7 +28,7 @@ function DatasetPreviewStep({ item variant="caption" component="h3" - sx={{ mb: 2, color: "grey" }} + sx={{ mb: 4, color: "grey" }} > Summary of the recently uploaded dataset with predefined column types. You can modify the type by selecting a different value. 
diff --git a/DashAI/front/src/components/datasets/DatasetSummaryModal.jsx b/DashAI/front/src/components/datasets/DatasetSummaryModal.jsx index 832b89fe3..777fefbd5 100644 --- a/DashAI/front/src/components/datasets/DatasetSummaryModal.jsx +++ b/DashAI/front/src/components/datasets/DatasetSummaryModal.jsx @@ -42,7 +42,7 @@ function DatasetSummaryModal({ datasetId }) { direction="row" justifyContent="space-around" alignItems="stretch" - spacing={2} + spacing={4} onClick={(event) => event.stopPropagation()} > {/* Dataset Summary Table */} diff --git a/DashAI/front/src/components/datasets/DatasetSummaryStep.jsx b/DashAI/front/src/components/datasets/DatasetSummaryStep.jsx index 139d2e9e8..629ead3e0 100644 --- a/DashAI/front/src/components/datasets/DatasetSummaryStep.jsx +++ b/DashAI/front/src/components/datasets/DatasetSummaryStep.jsx @@ -17,7 +17,7 @@ function DatasetSummaryStep({ return ( @@ -26,7 +26,7 @@ function DatasetSummaryStep({ item variant="caption" component="h3" - sx={{ mb: 2, color: "grey" }} + sx={{ mb: 4, color: "grey" }} > Summary of the recently uploaded dataset with predefined column types. You can modify the type by selecting a different value. diff --git a/DashAI/front/src/components/datasets/EditDataTypeModal.jsx b/DashAI/front/src/components/datasets/EditDataTypeModal.jsx index 199d0df8f..2a34ab0ce 100644 --- a/DashAI/front/src/components/datasets/EditDataTypeModal.jsx +++ b/DashAI/front/src/components/datasets/EditDataTypeModal.jsx @@ -68,7 +68,7 @@ function EditDataTypeModal({ datasetId, updateDatasets }) { event.stopPropagation()}> @@ -76,7 +76,7 @@ function EditDataTypeModal({ datasetId, updateDatasets }) { Summary of the dataset. You can modify the type by selecting a different value. 
diff --git a/DashAI/front/src/components/datasets/EditDatasetModal.jsx b/DashAI/front/src/components/datasets/EditDatasetModal.jsx index eb55ff56d..b960fa450 100644 --- a/DashAI/front/src/components/datasets/EditDatasetModal.jsx +++ b/DashAI/front/src/components/datasets/EditDatasetModal.jsx @@ -67,11 +67,11 @@ function EditDatasetModal({ datasetId, name, updateDatasets }) { direction="row" justifyContent="space-around" alignItems="stretch" - spacing={2} + spacing={4} > {/* New name field */} - + Enter a new name for your dataset @@ -82,7 +82,7 @@ function EditDatasetModal({ datasetId, name, updateDatasets }) { autoComplete="off" fullWidth onChange={(event) => setDatasetName(event.target.value)} - sx={{ mb: 2 }} + sx={{ mb: 4 }} /> diff --git a/DashAI/front/src/components/datasets/SelectDataloaderStep.jsx b/DashAI/front/src/components/datasets/SelectDataloaderStep.jsx index 0c2cd9209..8ea9f3181 100644 --- a/DashAI/front/src/components/datasets/SelectDataloaderStep.jsx +++ b/DashAI/front/src/components/datasets/SelectDataloaderStep.jsx @@ -67,16 +67,16 @@ function SelectDataloaderStep({ newDataset, setNewDataset, setNextEnabled }) { direction="column" justifyContent="space-around" alignItems="stretch" - spacing={2} + spacing={4} > {/* Title */} {`Select a way to upload your data`} {/* General information about supported formats */} - + Supported formats:
• Single files: Upload individual data files diff --git a/DashAI/front/src/components/datasets/Upload.jsx b/DashAI/front/src/components/datasets/Upload.jsx index 80db095d3..5f984ad4e 100644 --- a/DashAI/front/src/components/datasets/Upload.jsx +++ b/DashAI/front/src/components/datasets/Upload.jsx @@ -124,8 +124,8 @@ function Upload({ onFileUpload }) { }; return ( - - + + {/* state text */} @@ -163,7 +163,7 @@ function Upload({ onFileUpload }) { > setOpen(false)}> - + Set column
- + Class/Target Column @@ -196,7 +196,7 @@ const ConverterClassColumnModal = ({ Select one column to be used as the target variable for supervised learning. @@ -206,8 +206,8 @@ const ConverterClassColumnModal = ({ - - - + @@ -193,7 +193,7 @@ export default function ExplainersCard({ // Full mode for standalone page return ( - + - + {isRunning && } {!loading && isLocal && ( - + Select an instance - + {Object.entries(run.optimizer_parameters).map( ([key, value]) => ( @@ -310,7 +310,7 @@ export default function RunInfoModal({ - + diff --git a/DashAI/front/src/components/notebooks/ColumnSelector.jsx b/DashAI/front/src/components/notebooks/ColumnSelector.jsx index 6ff78a76a..17e6bbaf4 100644 --- a/DashAI/front/src/components/notebooks/ColumnSelector.jsx +++ b/DashAI/front/src/components/notebooks/ColumnSelector.jsx @@ -320,8 +320,8 @@ function ColumnSelector({ {/* Column requirements */} {Object.keys(inputCardinality).length > 0 && ( - + {t("datasets:label.requiredColumns", { exact: inputCardinality.exact, min: inputCardinality.min || 0, @@ -365,7 +365,7 @@ function ColumnSelector({ sx={{ color: "rgba(255, 255, 255, 0.5)", fontStyle: "italic", - mt: 1, + mt: 2, }} > @@ -383,7 +383,7 @@ function ColumnSelector({ {allowedDtypes.length > 0 && ( {t("common:allowedTypes")}:{" "} {showBackButton && ( diff --git a/DashAI/front/src/components/notebooks/DatasetNotebookLeftBar.jsx b/DashAI/front/src/components/notebooks/DatasetNotebookLeftBar.jsx index bb73cacd4..979c0c32e 100644 --- a/DashAI/front/src/components/notebooks/DatasetNotebookLeftBar.jsx +++ b/DashAI/front/src/components/notebooks/DatasetNotebookLeftBar.jsx @@ -132,7 +132,7 @@ export default function DatasetsNotebooksLeftBar({ onToggle }) { return ( {/* Create new item button */} - + {selectedDatasetId || selectedNotebookId ? 
( {/* Search bar global */} - + {t("common:note")} diff --git a/DashAI/front/src/components/notebooks/RightBar.jsx b/DashAI/front/src/components/notebooks/RightBar.jsx index 4bf740568..5bf0638ab 100644 --- a/DashAI/front/src/components/notebooks/RightBar.jsx +++ b/DashAI/front/src/components/notebooks/RightBar.jsx @@ -33,10 +33,10 @@ function SectionHeader({ icon: Icon, label, count, mt, theme, t }) { sx={{ display: "flex", alignItems: "center", - gap: 1, - mb: 1.5, + gap: 2, + mb: 3, mt: mt ?? 0, - pb: 0.5, + pb: 1, borderBottom: "1px solid", borderColor: theme.palette.divider, }} @@ -67,7 +67,7 @@ function RightBarDatasetView() { display: "flex", alignItems: "center", justifyContent: "center", - p: 2, + p: 4, }} > + + {t("datasets:label.explore")} @@ -397,7 +397,7 @@ export default function RightBar({ notebook, onToggle }) { + {t("datasets:label.convert")} @@ -417,7 +417,7 @@ export default function RightBar({ notebook, onToggle }) { {/* Search bar */} 0; const hasConverters = filteredConverters.length > 0; @@ -525,7 +525,7 @@ export default function RightBar({ notebook, onToggle }) { icon={TransformIcon} label={t("datasets:label.convert")} count={filteredConverters.length} - mt={hasExplorers ? 3 : 0} + mt={hasExplorers ? 
6 : 0} theme={theme} t={t} /> @@ -542,7 +542,7 @@ export default function RightBar({ notebook, onToggle }) { sx={{ color: "text.secondary", textAlign: "center", - py: 2, + py: 4, }} > {t("datasets:label.noToolsMatched")} diff --git a/DashAI/front/src/components/notebooks/RowSelector.jsx b/DashAI/front/src/components/notebooks/RowSelector.jsx index c927143ac..3372adc57 100644 --- a/DashAI/front/src/components/notebooks/RowSelector.jsx +++ b/DashAI/front/src/components/notebooks/RowSelector.jsx @@ -112,8 +112,8 @@ export function RowSelector({ totalRows, onSelectionChange, initialRows }) { }; return ( - - + + {t("datasets:label.selectionMode")} @@ -122,7 +122,7 @@ export function RowSelector({ totalRows, onSelectionChange, initialRows }) { direction="row" alignItems="center" justifyContent="space-between" - sx={{ mt: 1 }} + sx={{ mt: 2 }} > {selectionMode === "range" ? ( - - + + ) : ( - + )} - - + + - + @@ -161,7 +161,7 @@ export default function ConverterBox({ {converterComponent.display_name} - + {/* Descripción */} - + {converterComponent.description} @@ -240,7 +240,7 @@ export default function ConverterBox({ justifyContent: "center", }} > - + {t("common:processing")} )} diff --git a/DashAI/front/src/components/notebooks/converter/ConverterHistoryList.jsx b/DashAI/front/src/components/notebooks/converter/ConverterHistoryList.jsx index 7ce91f97b..381498c6c 100644 --- a/DashAI/front/src/components/notebooks/converter/ConverterHistoryList.jsx +++ b/DashAI/front/src/components/notebooks/converter/ConverterHistoryList.jsx @@ -35,7 +35,7 @@ export default function ConverterHistoryList({ width: "100%", }} > - + onConverterDelete(converter)} size="small" - sx={{ ml: 1 }} + sx={{ ml: 2 }} color="error" > diff --git a/DashAI/front/src/components/notebooks/converter/ItemsToDeleteList.jsx b/DashAI/front/src/components/notebooks/converter/ItemsToDeleteList.jsx index ee3dde180..47d80a769 100644 --- a/DashAI/front/src/components/notebooks/converter/ItemsToDeleteList.jsx +++ 
b/DashAI/front/src/components/notebooks/converter/ItemsToDeleteList.jsx @@ -10,14 +10,14 @@ const ItemsToDeleteList = React.memo(function ItemsToDeleteList({ items }) { return ( - + {t("common:itemsToBeDeleted")} @@ -36,7 +36,7 @@ const ItemsToDeleteList = React.memo(function ItemsToDeleteList({ items }) { sx={{ display: "flex", alignItems: "center", - py: 0.5, + py: 1, fontWeight: isSelected ? "bold" : "normal", //color: isSelected ? "#00BEBB" : "text.secondary", }} diff --git a/DashAI/front/src/components/notebooks/converterCreation/ConverterTargetColumnModal.jsx b/DashAI/front/src/components/notebooks/converterCreation/ConverterTargetColumnModal.jsx index c80c97c95..365f37b14 100644 --- a/DashAI/front/src/components/notebooks/converterCreation/ConverterTargetColumnModal.jsx +++ b/DashAI/front/src/components/notebooks/converterCreation/ConverterTargetColumnModal.jsx @@ -158,7 +158,7 @@ const ConverterTargetColumnModal = ({ variant="outlined" size="small" sx={{ - mr: 1, + mr: 2, color: classColumnInitialValue === null ? "error.main" : "inherit", borderColor: classColumnInitialValue === null ? 
"error.main" : "inherit", @@ -192,22 +192,22 @@ const ConverterTargetColumnModal = ({ setOpen(false)}> - + {t("datasets:button.setColumn")} - + - + {t("datasets:label.classTargetColumn")} {t("datasets:label.selectTargetColumnDescription")} @@ -216,8 +216,8 @@ const ConverterTargetColumnModal = ({ - - )} {" "} - + {!isLastStep && ( {loading && ( - + )} {!loading && !error && ( - + + {[PluginStatus.INSTALLED, PluginStatus.DOWNLOADED].includes( plugin.status, diff --git a/DashAI/front/src/pages/plugins/components/PluginsTab.jsx b/DashAI/front/src/pages/plugins/components/PluginsTab.jsx index 25f94be0d..10e82a72e 100644 --- a/DashAI/front/src/pages/plugins/components/PluginsTab.jsx +++ b/DashAI/front/src/pages/plugins/components/PluginsTab.jsx @@ -47,7 +47,7 @@ function PluginsTab({ refreshPluginsFlag, setRefreshPluginsFlag }) { justifyContent={"space-between"} > - + {t("plugins:label.plugins", { label })} diff --git a/DashAI/front/src/pages/plugins/components/PluginsTags.jsx b/DashAI/front/src/pages/plugins/components/PluginsTags.jsx index 18f605580..049223870 100644 --- a/DashAI/front/src/pages/plugins/components/PluginsTags.jsx +++ b/DashAI/front/src/pages/plugins/components/PluginsTags.jsx @@ -9,7 +9,7 @@ import PropTypes from "prop-types"; */ function PluginTags({ tags }) { return ( - + {tags.map((tag, i) => ( ))} diff --git a/DashAI/front/src/pages/plugins/components/PluginsToolbar.jsx b/DashAI/front/src/pages/plugins/components/PluginsToolbar.jsx index 0e33615af..9f957e34b 100644 --- a/DashAI/front/src/pages/plugins/components/PluginsToolbar.jsx +++ b/DashAI/front/src/pages/plugins/components/PluginsToolbar.jsx @@ -35,8 +35,8 @@ function PluginsToolbar({ ]; return ( - - + + - + + - setSplit(v)} sx={{ mb: 2 }}> + setSplit(v)} sx={{ mb: 4 }}> @@ -364,7 +364,7 @@ export function LiveMetricsChart({ run }) { )} - + - + {updatedTabs.map((tab) => ( ))} - + {currentTab === 0 && ( )} diff --git a/DashAI/front/src/pages/results/components/ResultsDialogViews.jsx 
b/DashAI/front/src/pages/results/components/ResultsDialogViews.jsx index c0880eee4..d761feb50 100644 --- a/DashAI/front/src/pages/results/components/ResultsDialogViews.jsx +++ b/DashAI/front/src/pages/results/components/ResultsDialogViews.jsx @@ -12,14 +12,14 @@ function ResultsDialogViews({ showTable, handleShowTable, handleShowGraphs }) { const theme = useTheme(); return ( - - + + {t("models:label.viewResultsAs")} - + - )} - {" "} - {!isLastStep && ( )} - - {isInteractive ? ( - - {index + 1} of {size} - - ) : ( - - )} + + 0 && !step.disableBackButton + ? backProps?.onClick + : undefined + } + onNext={!isInteractive ? primaryProps?.onClick : undefined} + showBack={index > 0 && !step.disableBackButton} + showNext={!isInteractive} + nextLabel={isLastStep ? t("common:finish") : t("common:next")} + sx={{ mt: 0, pt: 0, borderTop: 0 }} + /> ); diff --git a/DashAI/front/src/components/tour/TourProvider.jsx b/DashAI/front/src/components/tour/TourProvider.jsx index 8ae4d2e72..a828ed908 100644 --- a/DashAI/front/src/components/tour/TourProvider.jsx +++ b/DashAI/front/src/components/tour/TourProvider.jsx @@ -134,6 +134,8 @@ export const TourProvider = ({ scrollOffset={100} floaterProps={{ disableFlip: true, + hideArrow: true, + offset: 10, }} /> {children} diff --git a/DashAI/front/src/components/tour/tourStyles.js b/DashAI/front/src/components/tour/tourStyles.js index 4c07c10af..50229f9e8 100644 --- a/DashAI/front/src/components/tour/tourStyles.js +++ b/DashAI/front/src/components/tour/tourStyles.js @@ -7,7 +7,7 @@ export const tourStyles = { primaryColor: "#ef9f27", spotlightShadow: "0 0 15px rgba(0, 0, 0, 0.5)", textColor: "#333", - width: 380, + width: 280, zIndex: 10000, }, beacon: { diff --git a/DashAI/front/src/utils/i18n/locales/en/common.json b/DashAI/front/src/utils/i18n/locales/en/common.json index 18b7a6bc5..7782ebc1a 100644 --- a/DashAI/front/src/utils/i18n/locales/en/common.json +++ b/DashAI/front/src/utils/i18n/locales/en/common.json @@ -117,6 +117,7 @@ 
"started": "Started", "startTime": "Start Time", "status": "Status", + "step": "Step", "submit": "Submit", "submitting": "Submitting...", "table": "Table", diff --git a/DashAI/front/src/utils/i18n/locales/es/common.json b/DashAI/front/src/utils/i18n/locales/es/common.json index 86cf2af56..70619e552 100644 --- a/DashAI/front/src/utils/i18n/locales/es/common.json +++ b/DashAI/front/src/utils/i18n/locales/es/common.json @@ -117,6 +117,7 @@ "started": "Iniciado", "startTime": "Hora de Inicio", "status": "Estado", + "step": "Paso", "submit": "Enviar", "submitting": "Enviando...", "table": "Tabla", From 2f30f752c67193188b9404c24ea50d843817a1cf Mon Sep 17 00:00:00 2001 From: Creylay Date: Tue, 12 May 2026 12:35:36 -0400 Subject: [PATCH 095/361] Refactor tooltip styles and enhance spotlight effect for improved onboarding experience --- .../src/components/tour/CustomTooltip.jsx | 41 ++++++++----------- .../src/components/tour/TourProvider.jsx | 28 +++++++++++++ .../front/src/components/tour/tourStyles.js | 6 +-- 3 files changed, 49 insertions(+), 26 deletions(-) diff --git a/DashAI/front/src/components/tour/CustomTooltip.jsx b/DashAI/front/src/components/tour/CustomTooltip.jsx index 9c53523dc..008143c92 100644 --- a/DashAI/front/src/components/tour/CustomTooltip.jsx +++ b/DashAI/front/src/components/tour/CustomTooltip.jsx @@ -5,19 +5,16 @@ import { useTheme } from "@mui/material/styles"; import { useTranslation } from "react-i18next"; import StepperNavigationFooter from "../shared/StepperNavigationFooter"; -// Half-diagonal of the rotated square = visible arrow protrusion in px const ARROW = 8; -function getArrowSx(placement) { +function getArrowSx(placement, bg) { if (!placement || placement === "center" || placement === "auto") return null; const base = { position: "absolute", width: ARROW * 2, height: ARROW * 2, - backgroundColor: "#fff", + backgroundColor: bg, transform: "rotate(45deg)", - // lower z-index so it sits behind the tooltip content but is part of the - // 
parent's filter stacking context zIndex: 0, }; if (placement.startsWith("top")) @@ -45,17 +42,18 @@ export const CustomTooltip = ({ const { t } = useTranslation(["common"]); const theme = useTheme(); const isInteractive = step.isInteractive; - const arrowSx = getArrowSx(step.placement); + + const bg = theme.palette.background.paper; + const arrowSx = getArrowSx(step.placement, bg); return ( - {/* Custom arrow — part of the Box's filter context */} {arrowSx && } - {/* Close button */} - {/* Step counter */} - {/* Content */} {step.content} - {/* Footer: Skip (left) | Back + Next (right) */} {t("common:skipTour")} diff --git a/DashAI/front/src/components/tour/TourProvider.jsx b/DashAI/front/src/components/tour/TourProvider.jsx index a828ed908..63209e4d6 100644 --- a/DashAI/front/src/components/tour/TourProvider.jsx +++ b/DashAI/front/src/components/tour/TourProvider.jsx @@ -7,6 +7,7 @@ import React, { useRef, } from "react"; import Joyride from "react-joyride"; +import GlobalStyles from "@mui/material/GlobalStyles"; import { useTranslation } from "react-i18next"; import { useTour } from "../../hooks/useTour"; import { tours } from "../../constants/tours"; @@ -14,6 +15,32 @@ import { tourStyles } from "./tourStyles"; import { CustomTooltip } from "./CustomTooltip"; import { useTourRegistry } from "../../contexts/TourRegistryContext"; +// hard-light blend: result = 1 - 2*(1-src)*(1-dst) when src > 0.5 +// Inverse: src = 1 - (1-target) / (2*(1-dst)) +// For near-black dst ≈ 0, simplifies to src ≈ (1 + target) / 2 +function hardLightInverse(hexColor) { + const r = parseInt(hexColor.slice(1, 3), 16); + const g = parseInt(hexColor.slice(3, 5), 16); + const b = parseInt(hexColor.slice(5, 7), 16); + const ri = Math.min(255, Math.round((255 + r) / 2)); + const gi = Math.min(255, Math.round((255 + g) / 2)); + const bi = Math.min(255, Math.round((255 + b) / 2)); + return `rgb(${ri},${gi},${bi})`; +} + +const SpotlightHighlight = () => ( + { + const c = 
hardLightInverse(theme.palette.primary.main); + return { + ".react-joyride__spotlight": { + boxShadow: `0 0 0 2px ${c}, 0 0 14px ${c} !important`, + }, + }; + }} + /> +); + const TourContext = createContext(null); export const useTourContext = () => useContext(TourContext); @@ -115,6 +142,7 @@ export const TourProvider = ({ return ( + Date: Tue, 12 May 2026 12:57:17 -0400 Subject: [PATCH 096/361] Enhance tooltip design by adding border styles and improving shadow effects for better visibility --- .../src/components/tour/CustomTooltip.jsx | 44 +++++++++++++++---- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/DashAI/front/src/components/tour/CustomTooltip.jsx b/DashAI/front/src/components/tour/CustomTooltip.jsx index 008143c92..cc67d2339 100644 --- a/DashAI/front/src/components/tour/CustomTooltip.jsx +++ b/DashAI/front/src/components/tour/CustomTooltip.jsx @@ -1,14 +1,15 @@ import React from "react"; import { Box, Button, IconButton, Typography } from "@mui/material"; import CloseIcon from "@mui/icons-material/Close"; -import { useTheme } from "@mui/material/styles"; +import { useTheme, alpha } from "@mui/material/styles"; import { useTranslation } from "react-i18next"; import StepperNavigationFooter from "../shared/StepperNavigationFooter"; const ARROW = 8; -function getArrowSx(placement, bg) { +function getArrowSx(placement, bg, borderColor) { if (!placement || placement === "center" || placement === "auto") return null; + const border = `1px solid ${borderColor}`; const base = { position: "absolute", width: ARROW * 2, @@ -18,13 +19,37 @@ function getArrowSx(placement, bg) { zIndex: 0, }; if (placement.startsWith("top")) - return { ...base, bottom: -ARROW, left: `calc(50% - ${ARROW}px)` }; + return { + ...base, + bottom: -ARROW, + left: `calc(50% - ${ARROW}px)`, + borderBottom: border, + borderRight: border, + }; if (placement.startsWith("bottom")) - return { ...base, top: -ARROW, left: `calc(50% - ${ARROW}px)` }; + return { + ...base, + top: 
-ARROW, + left: `calc(50% - ${ARROW}px)`, + borderTop: border, + borderLeft: border, + }; if (placement.startsWith("left")) - return { ...base, right: -ARROW, top: `calc(50% - ${ARROW}px)` }; + return { + ...base, + right: -ARROW, + top: `calc(50% - ${ARROW}px)`, + borderTop: border, + borderRight: border, + }; if (placement.startsWith("right")) - return { ...base, left: -ARROW, top: `calc(50% - ${ARROW}px)` }; + return { + ...base, + left: -ARROW, + top: `calc(50% - ${ARROW}px)`, + borderBottom: border, + borderLeft: border, + }; return null; } @@ -44,7 +69,8 @@ export const CustomTooltip = ({ const isInteractive = step.isInteractive; const bg = theme.palette.background.paper; - const arrowSx = getArrowSx(step.placement, bg); + const borderColor = alpha(theme.palette.primary.main, 0.5); + const arrowSx = getArrowSx(step.placement, bg, borderColor); return ( Date: Tue, 12 May 2026 11:39:14 -0400 Subject: [PATCH 097/361] feat: show snackbar on hub download finish/error --- DashAI/front/src/pages/hub/HubContent.jsx | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/DashAI/front/src/pages/hub/HubContent.jsx b/DashAI/front/src/pages/hub/HubContent.jsx index a38f5daee..a84c6e694 100644 --- a/DashAI/front/src/pages/hub/HubContent.jsx +++ b/DashAI/front/src/pages/hub/HubContent.jsx @@ -25,6 +25,7 @@ import { import { enqueueHubDownloadJob } from "../../api/job"; import { startJobPolling } from "../../hooks/useJobPolling"; import { useTranslation } from "react-i18next"; +import { useSnackbar } from "notistack"; const SOURCE_ICONS = { HuggingFaceDatasetSource: HubIcon, @@ -33,6 +34,7 @@ const SOURCE_ICONS = { export default function HubContent() { const { t } = useTranslation(["hub"]); + const { enqueueSnackbar } = useSnackbar(); const navigate = useNavigate(); const { sourceName: sourceNameParam } = useParams(); const threePanelLayout = useThreePanelLayout({ storageKey: "hub" }); @@ -96,13 +98,22 @@ export default function HubContent() { 
if (watchedJobsRef.current.has(d.job_id)) continue; watchedJobsRef.current.add(d.job_id); - const refresh = async () => { + const onDone = async (isError) => { try { const updated = await getHubDownload(d.id); setDownloads((prev) => ({ ...prev, [`${updated.source_name}::${updated.dataset_id}`]: updated, })); + if (isError) { + enqueueSnackbar(t("hub:downloadFailed") + `: ${d.name}`, { + variant: "error", + }); + } else { + enqueueSnackbar(t("hub:downloaded") + `: ${d.name}`, { + variant: "success", + }); + } } catch { // ignore } finally { @@ -110,7 +121,11 @@ export default function HubContent() { } }; - startJobPolling(d.job_id, refresh, refresh); + startJobPolling( + d.job_id, + () => onDone(false), + () => onDone(true), + ); } }, [downloads]); From be8bfd5385a74f5f588ffdacf534eba3ef156c21 Mon Sep 17 00:00:00 2001 From: Cristian Tamblay Date: Tue, 12 May 2026 14:52:25 -0400 Subject: [PATCH 098/361] Added CPU limit to demo machine and added 2 more machine in dockercompose --- docker-compose.demo.yml | 76 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/docker-compose.demo.yml b/docker-compose.demo.yml index 16938be59..516c318e8 100644 --- a/docker-compose.demo.yml +++ b/docker-compose.demo.yml @@ -5,6 +5,11 @@ services: - "8001:8000" volumes: - dashai-data-1:/root/.DashAI + deploy: + resources: + limits: + cpus: '2.5' + memory: 8G dashai-2: image: dashai:latest @@ -12,6 +17,11 @@ services: - "8002:8000" volumes: - dashai-data-2:/root/.DashAI + deploy: + resources: + limits: + cpus: '2.5' + memory: 8G dashai-3: image: dashai:latest @@ -19,6 +29,11 @@ services: - "8003:8000" volumes: - dashai-data-3:/root/.DashAI + deploy: + resources: + limits: + cpus: '2.5' + memory: 8G dashai-4: image: dashai:latest @@ -26,6 +41,11 @@ services: - "8004:8000" volumes: - dashai-data-4:/root/.DashAI + deploy: + resources: + limits: + cpus: '2.5' + memory: 8G dashai-5: image: dashai:latest @@ -33,6 +53,11 @@ services: - "8005:8000" volumes: - 
dashai-data-5:/root/.DashAI + deploy: + resources: + limits: + cpus: '2.5' + memory: 8G dashai-6: image: dashai:latest @@ -40,6 +65,11 @@ services: - "8006:8000" volumes: - dashai-data-6:/root/.DashAI + deploy: + resources: + limits: + cpus: '2.5' + memory: 8G dashai-7: image: dashai:latest @@ -47,6 +77,11 @@ services: - "8007:8000" volumes: - dashai-data-7:/root/.DashAI + deploy: + resources: + limits: + cpus: '2.5' + memory: 8G dashai-8: image: dashai:latest @@ -54,6 +89,11 @@ services: - "8008:8000" volumes: - dashai-data-8:/root/.DashAI + deploy: + resources: + limits: + cpus: '2.5' + memory: 8G dashai-9: image: dashai:latest @@ -61,6 +101,11 @@ services: - "8009:8000" volumes: - dashai-data-9:/root/.DashAI + deploy: + resources: + limits: + cpus: '2.5' + memory: 8G dashai-10: image: dashai:latest @@ -68,6 +113,35 @@ services: - "8010:8000" volumes: - dashai-data-10:/root/.DashAI + deploy: + resources: + limits: + cpus: '2.5' + memory: 8G + + dashai-11: + image: dashai:latest + ports: + - "8011:8000" + volumes: + - dashai-data-11:/root/.DashAI + deploy: + resources: + limits: + cpus: '2.5' + memory: 8G + + dashai-12: + image: dashai:latest + ports: + - "8012:8000" + volumes: + - dashai-data-12:/root/.DashAI + deploy: + resources: + limits: + cpus: '2.5' + memory: 8G volumes: dashai-data-1: @@ -80,3 +154,5 @@ volumes: dashai-data-8: dashai-data-9: dashai-data-10: + dashai-data-11: + dashai-data-12: From 43242a942d31727d32000323b555d580dd6ee4e3 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Tue, 12 May 2026 15:00:55 -0400 Subject: [PATCH 099/361] fix: convert numpy scalars to native Python in preview rows jsonable_encoder fails on numpy.int64/float32 values -- .item() converts any numpy scalar to its native Python equivalent. 
--- DashAI/back/job/predict_job.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/DashAI/back/job/predict_job.py b/DashAI/back/job/predict_job.py index 54ca2ac31..69fd411cc 100644 --- a/DashAI/back/job/predict_job.py +++ b/DashAI/back/job/predict_job.py @@ -49,11 +49,14 @@ def _build_preview_rows( """Build JSON-safe tabular rows for manual preview responses.""" columns = list(input_columns) + [output_col] + def _to_native(v: Any) -> Any: + return v.item() if hasattr(v, "item") else v + rows: List[List] = [] input_data = prepared_dataset.to_dict() for i in range(len(y_pred)): - row = [input_data[col][i] for col in input_columns] - row.append(y_pred[i]) + row = [_to_native(input_data[col][i]) for col in input_columns] + row.append(_to_native(y_pred[i])) rows.append(row) columns_json = jsonable_encoder(columns) From 50ac2ca3400cbd498c2c5e009483883281ecbdd3 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Tue, 12 May 2026 15:03:54 -0400 Subject: [PATCH 100/361] fix: reuse single createEmptyRow call in createInitialRows Two separate calls each picked a different randomIndex, so the displayed row and the row sent to the backend diverged on first load. 
--- DashAI/front/src/components/predictions/ManualInputForm.jsx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/DashAI/front/src/components/predictions/ManualInputForm.jsx b/DashAI/front/src/components/predictions/ManualInputForm.jsx index 8e528419a..af07f3a48 100644 --- a/DashAI/front/src/components/predictions/ManualInputForm.jsx +++ b/DashAI/front/src/components/predictions/ManualInputForm.jsx @@ -33,8 +33,9 @@ export default function ManualInputForm({ if (manualInputData && manualInputData.length > 0) { return manualInputData; } - setManualInputData([createEmptyRow()]); - return [createEmptyRow()]; + const initialRow = createEmptyRow(); + setManualInputData([initialRow]); + return [initialRow]; } function createEmptyRow() { From ca3a0ec767923e5c41e8db4ce0f6582611bf29fe Mon Sep 17 00:00:00 2001 From: Creylay Date: Tue, 12 May 2026 15:15:56 -0400 Subject: [PATCH 101/361] Add margin adjustments to tooltip based on placement and increase offset in TourProvider --- DashAI/front/src/components/tour/CustomTooltip.jsx | 10 ++++++++++ DashAI/front/src/components/tour/TourProvider.jsx | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/DashAI/front/src/components/tour/CustomTooltip.jsx b/DashAI/front/src/components/tour/CustomTooltip.jsx index cc67d2339..2acba5a23 100644 --- a/DashAI/front/src/components/tour/CustomTooltip.jsx +++ b/DashAI/front/src/components/tour/CustomTooltip.jsx @@ -72,6 +72,15 @@ export const CustomTooltip = ({ const borderColor = alpha(theme.palette.primary.main, 0.5); const arrowSx = getArrowSx(step.placement, bg, borderColor); + const getMarginSx = () => { + if (!step.placement) return {}; + if (step.placement.startsWith("top")) return { marginBottom: "7px" }; + if (step.placement.startsWith("bottom")) return { marginTop: "7px" }; + if (step.placement.startsWith("left")) return { marginRight: "7px" }; + if (step.placement.startsWith("right")) return { marginLeft: "7px" }; + return {}; + }; + return ( 
{arrowSx && } diff --git a/DashAI/front/src/components/tour/TourProvider.jsx b/DashAI/front/src/components/tour/TourProvider.jsx index 63209e4d6..cad747063 100644 --- a/DashAI/front/src/components/tour/TourProvider.jsx +++ b/DashAI/front/src/components/tour/TourProvider.jsx @@ -163,7 +163,7 @@ export const TourProvider = ({ floaterProps={{ disableFlip: true, hideArrow: true, - offset: 10, + offset: 18, }} /> {children} From 1a3be597b02ef93b10c15cf7011e3732f1a7a0ba Mon Sep 17 00:00:00 2001 From: Creylay Date: Tue, 12 May 2026 15:19:27 -0400 Subject: [PATCH 102/361] Remove spotlightClicks option from datasets button in home tour steps --- DashAI/front/src/constants/tours/homeTour.js | 1 - 1 file changed, 1 deletion(-) diff --git a/DashAI/front/src/constants/tours/homeTour.js b/DashAI/front/src/constants/tours/homeTour.js index 3db448d09..9887cffd2 100644 --- a/DashAI/front/src/constants/tours/homeTour.js +++ b/DashAI/front/src/constants/tours/homeTour.js @@ -13,7 +13,6 @@ export const homeTourSteps = [ ), placement: "bottom", disableBeacon: true, - spotlightClicks: true, }, { target: '[data-tour="models-button"]', From ce5aa0ebeac1b63d96f6b76cc8bc0b692c03c1e5 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Tue, 12 May 2026 15:33:24 -0400 Subject: [PATCH 103/361] refactor: rename HubDownload to Datafile across backend and frontend --- DashAI/back/api/api_v1/api.py | 4 +- .../{hub_download.py => datafile.py} | 70 ++++++------ .../api/api_v1/endpoints/dataset_source.py | 22 ++-- DashAI/back/core/enums/status.py | 2 +- DashAI/back/dependencies/database/models.py | 8 +- DashAI/back/job/datafile_job.py | 108 ++++++++++++++++++ DashAI/back/job/dataset_job.py | 20 ++-- DashAI/front/src/api/hub.ts | 30 +++-- DashAI/front/src/api/job.ts | 8 +- .../src/components/hub/DatasetDetail.jsx | 2 +- .../src/components/hub/HubImportPanel.jsx | 26 ++--- .../front/src/components/hub/HubLeftBar.jsx | 21 +++- DashAI/front/src/pages/hub/HubContent.jsx | 18 +-- 13 files changed, 227 
insertions(+), 112 deletions(-) rename DashAI/back/api/api_v1/endpoints/{hub_download.py => datafile.py} (74%) create mode 100644 DashAI/back/job/datafile_job.py diff --git a/DashAI/back/api/api_v1/api.py b/DashAI/back/api/api_v1/api.py index 0695c38de..d3fd89fb9 100644 --- a/DashAI/back/api/api_v1/api.py +++ b/DashAI/back/api/api_v1/api.py @@ -2,6 +2,7 @@ from DashAI.back.api.api_v1.endpoints.components import router as components from DashAI.back.api.api_v1.endpoints.converters import router as converters +from DashAI.back.api.api_v1.endpoints.datafile import router as datafile_router from DashAI.back.api.api_v1.endpoints.dataset_source import router as dataset_source from DashAI.back.api.api_v1.endpoints.datasets import router as datasets from DashAI.back.api.api_v1.endpoints.explainers import router as explainers @@ -13,7 +14,6 @@ router as generative_session, ) from DashAI.back.api.api_v1.endpoints.hardware import router as hardware -from DashAI.back.api.api_v1.endpoints.hub_download import router as hub_download from DashAI.back.api.api_v1.endpoints.jobs import router as jobs from DashAI.back.api.api_v1.endpoints.metrics import router as metrics from DashAI.back.api.api_v1.endpoints.model_sessions import router as model_sessions @@ -43,4 +43,4 @@ api_router_v1.include_router(hardware, prefix="/hardware") api_router_v1.include_router(scoring, prefix="/scoring") api_router_v1.include_router(dataset_source, prefix="/dataset-source") -api_router_v1.include_router(hub_download, prefix="/hub-download") +api_router_v1.include_router(datafile_router, prefix="/datafile") diff --git a/DashAI/back/api/api_v1/endpoints/hub_download.py b/DashAI/back/api/api_v1/endpoints/datafile.py similarity index 74% rename from DashAI/back/api/api_v1/endpoints/hub_download.py rename to DashAI/back/api/api_v1/endpoints/datafile.py index 8f9fefb13..fbad92aaa 100644 --- a/DashAI/back/api/api_v1/endpoints/hub_download.py +++ b/DashAI/back/api/api_v1/endpoints/datafile.py @@ -1,4 +1,4 @@ 
-"""Hub download management endpoints.""" +"""Datafile management endpoints.""" import logging import os @@ -11,8 +11,8 @@ from pydantic import BaseModel from sqlalchemy import exc -from DashAI.back.core.enums.status import HubDownloadStatus -from DashAI.back.dependencies.database.models import HubDownload +from DashAI.back.core.enums.status import DatafileStatus +from DashAI.back.dependencies.database.models import Datafile if TYPE_CHECKING: from sqlalchemy.orm import sessionmaker @@ -23,7 +23,7 @@ router = APIRouter() -def _row_to_dict(row: HubDownload) -> Dict[str, Any]: +def _row_to_dict(row: Datafile) -> Dict[str, Any]: return { "id": row.id, "source_name": row.source_name, @@ -47,9 +47,9 @@ class CreateDownloadRequest(BaseModel): async def list_downloads( session_factory: "sessionmaker" = Depends(lambda: di["session_factory"]), ) -> List[Dict[str, Any]]: - """Return all hub download records.""" + """Return all datafile records.""" with session_factory() as db: - rows = db.query(HubDownload).order_by(HubDownload.created.desc()).all() + rows = db.query(Datafile).order_by(Datafile.created.desc()).all() return [_row_to_dict(r) for r in rows] @@ -60,12 +60,12 @@ async def create_download( registry: "ComponentRegistry" = Depends(lambda: di["component_registry"]), job_queue=Depends(lambda: di["job_queue"]), ) -> Dict[str, Any]: - """Create a HubDownload record and enqueue the download job. + """Create a Datafile record and enqueue the download job. If a record for (source_name, dataset_id) already exists and its status is READY, it is returned immediately without re-downloading. 
""" - from DashAI.back.job.hub_download_job import HubDownloadJob + from DashAI.back.job.datafile_job import DatafileJob sources = registry._registry.get("DatasetSource", {}) if body.source_name not in sources: @@ -76,20 +76,20 @@ async def create_download( with session_factory() as db: existing = ( - db.query(HubDownload) + db.query(Datafile) .filter( - HubDownload.source_name == body.source_name, - HubDownload.dataset_id == body.dataset_id, + Datafile.source_name == body.source_name, + Datafile.dataset_id == body.dataset_id, ) .first() ) if existing is not None: - if existing.status == HubDownloadStatus.READY: + if existing.status == DatafileStatus.READY: return _row_to_dict(existing) - if existing.status == HubDownloadStatus.DOWNLOADING: + if existing.status == DatafileStatus.DOWNLOADING: return _row_to_dict(existing) # ERROR — allow retry: reset to downloading - existing.status = HubDownloadStatus.DOWNLOADING + existing.status = DatafileStatus.DOWNLOADING existing.error_message = None existing.local_path = None existing.name = body.name @@ -104,11 +104,11 @@ async def create_download( ) from e row = existing else: - row = HubDownload( + row = Datafile( source_name=body.source_name, dataset_id=body.dataset_id, name=body.name, - status=HubDownloadStatus.DOWNLOADING, + status=DatafileStatus.DOWNLOADING, ) db.add(row) try: @@ -121,12 +121,12 @@ async def create_download( detail="DB error creating download record.", ) from e - hub_download_id = row.id + datafile_id = row.id result_dict = _row_to_dict(row) - job = HubDownloadJob( + job = DatafileJob( kwargs={ - "hub_download_id": hub_download_id, + "datafile_id": datafile_id, "source_name": body.source_name, "dataset_source_id": body.dataset_id, } @@ -137,36 +137,36 @@ async def create_download( return result_dict -@router.get("/{hub_download_id}", response_model=Dict[str, Any]) +@router.get("/{datafile_id}", response_model=Dict[str, Any]) async def get_download( - hub_download_id: int, + datafile_id: int, 
session_factory: "sessionmaker" = Depends(lambda: di["session_factory"]), ) -> Dict[str, Any]: - """Return a single hub download record by id.""" + """Return a single datafile record by id.""" with session_factory() as db: - row = db.get(HubDownload, hub_download_id) + row = db.get(Datafile, datafile_id) if row is None: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, - detail=f"HubDownload {hub_download_id} not found.", + detail=f"Datafile {datafile_id} not found.", ) return _row_to_dict(row) -@router.delete("/{hub_download_id}", status_code=status.HTTP_204_NO_CONTENT) +@router.delete("/{datafile_id}", status_code=status.HTTP_204_NO_CONTENT) async def delete_download( - hub_download_id: int, + datafile_id: int, session_factory: "sessionmaker" = Depends(lambda: di["session_factory"]), ) -> None: - """Delete a hub download record and its cached files.""" + """Delete a datafile record and its cached files.""" import shutil with session_factory() as db: - row = db.get(HubDownload, hub_download_id) + row = db.get(Datafile, datafile_id) if row is None: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, - detail=f"HubDownload {hub_download_id} not found.", + detail=f"Datafile {datafile_id} not found.", ) local_path = row.local_path try: @@ -183,20 +183,20 @@ async def delete_download( shutil.rmtree(local_path, ignore_errors=True) -@router.get("/{hub_download_id}/files", response_model=List[str]) +@router.get("/{datafile_id}/files", response_model=List[str]) async def list_files( - hub_download_id: int, + datafile_id: int, session_factory: "sessionmaker" = Depends(lambda: di["session_factory"]), ) -> List[str]: - """Return the list of files in a ready hub download directory.""" + """Return the list of files in a ready datafile directory.""" with session_factory() as db: - row = db.get(HubDownload, hub_download_id) + row = db.get(Datafile, datafile_id) if row is None: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, - detail=f"HubDownload 
{hub_download_id} not found.", + detail=f"Datafile {datafile_id} not found.", ) - if row.status != HubDownloadStatus.READY or not row.local_path: + if row.status != DatafileStatus.READY or not row.local_path: raise HTTPException( status_code=status.HTTP_409_CONFLICT, detail="Download is not ready yet.", diff --git a/DashAI/back/api/api_v1/endpoints/dataset_source.py b/DashAI/back/api/api_v1/endpoints/dataset_source.py index 024f9f8f9..00c70dfd0 100644 --- a/DashAI/back/api/api_v1/endpoints/dataset_source.py +++ b/DashAI/back/api/api_v1/endpoints/dataset_source.py @@ -180,16 +180,16 @@ class PreviewRequest(BaseModel): DataLoader parameters (e.g., separator for CSV). n_rows : int Number of rows to sample (1-500). - hub_download_id : int | None + datafile_id : int | None If set, use this pre-downloaded local file instead of fetching from source. selected_file : str | None - Relative filename inside the hub download directory. + Relative filename inside the datafile directory. """ dataloader: str | None = None params: Dict[str, Any] = {} n_rows: int = 100 - hub_download_id: int | None = None + datafile_id: int | None = None selected_file: str | None = None @@ -213,11 +213,11 @@ async def preview_dataset_with_params( dataset_id : str Source-specific dataset identifier (URL-encoded). body : PreviewRequest - DataLoader name, params, row count, and optional hub_download_id. + DataLoader name, params, row count, and optional datafile_id. registry : ComponentRegistry Injected component registry. session_factory - Injected DB session factory (used when hub_download_id is set). + Injected DB session factory (used when datafile_id is set). 
Returns ------- @@ -229,18 +229,18 @@ async def preview_dataset_with_params( n_rows = max(1, min(body.n_rows, 500)) try: - if body.hub_download_id is None: + if body.datafile_id is None: raise HTTPException( status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, - detail="hub_download_id is required.", + detail="datafile_id is required.", ) - from DashAI.back.core.enums.status import HubDownloadStatus - from DashAI.back.dependencies.database.models import HubDownload + from DashAI.back.core.enums.status import DatafileStatus + from DashAI.back.dependencies.database.models import Datafile with session_factory() as db: - hub_row = db.get(HubDownload, body.hub_download_id) - if hub_row is None or hub_row.status != HubDownloadStatus.READY: + hub_row = db.get(Datafile, body.datafile_id) + if hub_row is None or hub_row.status != DatafileStatus.READY: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail="Hub download not ready or not found.", diff --git a/DashAI/back/core/enums/status.py b/DashAI/back/core/enums/status.py index 5fa493058..a21973878 100644 --- a/DashAI/back/core/enums/status.py +++ b/DashAI/back/core/enums/status.py @@ -55,7 +55,7 @@ class PredictionStatus(Enum): ERROR = 4 -class HubDownloadStatus(Enum): +class DatafileStatus(Enum): DOWNLOADING = "downloading" READY = "ready" ERROR = "error" diff --git a/DashAI/back/dependencies/database/models.py b/DashAI/back/dependencies/database/models.py index c4b7ae1af..677433306 100644 --- a/DashAI/back/dependencies/database/models.py +++ b/DashAI/back/dependencies/database/models.py @@ -22,10 +22,10 @@ from DashAI.back.core.enums.plugin_tags import PluginTag from DashAI.back.core.enums.status import ( ConverterStatus, + DatafileStatus, DatasetStatus, ExplainerStatus, ExplorerStatus, - HubDownloadStatus, PluginStatus, PredictionStatus, RunStatus, @@ -720,7 +720,7 @@ def delete_result(self) -> None: self.end_time = None -class HubDownload(Base): +class Datafile(Base): __tablename__ = "datafile" id: 
Mapped[int] = mapped_column(primary_key=True) @@ -729,9 +729,9 @@ class HubDownload(Base): name: Mapped[str] = mapped_column(String, nullable=False) local_path: Mapped[str] = mapped_column(String, nullable=True) status: Mapped[Enum] = mapped_column( - Enum(HubDownloadStatus), + Enum(DatafileStatus), nullable=False, - default=HubDownloadStatus.DOWNLOADING, + default=DatafileStatus.DOWNLOADING, ) error_message: Mapped[str] = mapped_column(String, nullable=True) created: Mapped[DateTime] = mapped_column(DateTime, default=datetime.now) diff --git a/DashAI/back/job/datafile_job.py b/DashAI/back/job/datafile_job.py new file mode 100644 index 000000000..ddcc9b852 --- /dev/null +++ b/DashAI/back/job/datafile_job.py @@ -0,0 +1,108 @@ +"""Job for downloading a dataset from an external hub source.""" + +import logging +from pathlib import Path +from typing import TYPE_CHECKING + +from kink import di, inject +from sqlalchemy import exc + +from DashAI.back.core.enums.status import DatafileStatus +from DashAI.back.dependencies.database.models import Datafile +from DashAI.back.job.base_job import BaseJob, JobError + +if TYPE_CHECKING: + from sqlalchemy.orm import sessionmaker + +log = logging.getLogger(__name__) + + +class DatafileJob(BaseJob): + """Job that fetches a dataset file from an external hub source. 
+ + Parameters + ---------- + kwargs : dict + - datafile_id: int — DB row id + - source_name: str — DatasetSource class name + - dataset_source_id: str — source-specific dataset identifier + """ + + @inject + def set_status_as_delivered( + self, session_factory: "sessionmaker" = lambda di: di["session_factory"] + ) -> None: + """No-op: datafile downloads don't use the delivered state.""" + + @inject + def set_status_as_error( + self, session_factory: "sessionmaker" = lambda di: di["session_factory"] + ) -> None: + datafile_id: int = self.kwargs["datafile_id"] + error_msg: str = self.kwargs.get("_error_message", "") + with session_factory() as db: + row: Datafile = db.get(Datafile, datafile_id) + if row is not None: + row.status = DatafileStatus.ERROR + row.error_message = error_msg + try: + db.commit() + except exc.SQLAlchemyError as e: + log.exception(e) + + def get_job_name(self) -> str: + return f"Hub download: {self.kwargs.get('dataset_source_id', '')}" + + @inject + def run(self) -> None: + import shutil + + component_registry = di["component_registry"] + session_factory = di["session_factory"] + config = di["config"] + + datafile_id: int = self.kwargs["datafile_id"] + source_name: str = self.kwargs["source_name"] + dataset_source_id: str = self.kwargs["dataset_source_id"] + + download_dir: Path = config["DATAFILE_PATH"] / str(datafile_id) + + try: + sources = component_registry._registry.get("DatasetSource", {}) + if source_name not in sources: + raise JobError(f"DatasetSource '{source_name}' not found in registry.") + + download_dir.mkdir(parents=True, exist_ok=True) + source = sources[source_name]["class"]() + file_path = source.download_dataset(dataset_source_id, str(download_dir)) + log.debug("Hub dataset '%s' downloaded to %s", dataset_source_id, file_path) + + with session_factory() as db: + row: Datafile = db.get(Datafile, datafile_id) + if row is None: + raise JobError(f"Datafile row {datafile_id} not found.") + row.status = DatafileStatus.READY + 
row.local_path = str(download_dir) + try: + db.commit() + except exc.SQLAlchemyError as e: + log.exception(e) + raise JobError("DB error saving download path.") from e + + log.debug("Datafile download job %d completed.", datafile_id) + + except JobError as e: + log.error("Datafile download job %d failed: %s", datafile_id, e) + self.kwargs["_error_message"] = str(e) + with session_factory() as db: + row = db.get(Datafile, datafile_id) + if row is not None: + row.status = DatafileStatus.ERROR + row.error_message = str(e) + try: + db.commit() + except exc.SQLAlchemyError as db_err: + log.exception(db_err) + if download_dir.exists(): + shutil.rmtree(download_dir, ignore_errors=True) + raise diff --git a/DashAI/back/job/dataset_job.py b/DashAI/back/job/dataset_job.py index 897078363..ab9052a5a 100644 --- a/DashAI/back/job/dataset_job.py +++ b/DashAI/back/job/dataset_job.py @@ -166,25 +166,21 @@ def run( if source_name: # --- Hub import path --- - from DashAI.back.core.enums.status import HubDownloadStatus + from DashAI.back.core.enums.status import DatafileStatus from DashAI.back.dependencies.database.models import ( - HubDownload, + Datafile, ) - hub_download_id = params.get("hub_download_id") + datafile_id = params.get("datafile_id") selected_file = params.get("selected_file") - if hub_download_id is None: - raise JobError( - "hub_download_id is required for hub imports." - ) + if datafile_id is None: + raise JobError("datafile_id is required for hub imports.") with session_factory() as db: - hub_row = db.get(HubDownload, hub_download_id) - if hub_row is None or hub_row.status != HubDownloadStatus.READY: - raise JobError( - f"HubDownload {hub_download_id} is not ready." 
- ) + hub_row = db.get(Datafile, datafile_id) + if hub_row is None or hub_row.status != DatafileStatus.READY: + raise JobError(f"Datafile {datafile_id} is not ready.") hub_work_dir = hub_row.local_path if selected_file: file_path_hub = str(Path(hub_work_dir) / selected_file) diff --git a/DashAI/front/src/api/hub.ts b/DashAI/front/src/api/hub.ts index f067ec8ff..2d4b52600 100644 --- a/DashAI/front/src/api/hub.ts +++ b/DashAI/front/src/api/hub.ts @@ -77,7 +77,7 @@ export const previewHubDataset = async ( dataloader, params: params ?? {}, n_rows: nRows, - hub_download_id: hubDownloadId ?? null, + datafile_id: hubDownloadId ?? null, selected_file: selectedFile ?? null, }, ); @@ -98,42 +98,40 @@ export const importHubDataset = async ( return response.data; }; -// ---- Hub Downloads ---- +// ---- Datafiles ---- -const hubDownloadEndpoint = "/v1/hub-download"; +const datafileEndpoint = "/v1/datafile"; -export type HubDownloadStatus = "downloading" | "ready" | "error"; +export type DatafileStatus = "downloading" | "ready" | "error"; -export interface HubDownload { +export interface Datafile { id: number; source_name: string; dataset_id: string; name: string; local_path: string | null; - status: HubDownloadStatus; + status: DatafileStatus; error_message: string | null; created: string | null; last_modified: string | null; job_id?: string; } -export const listHubDownloads = async (): Promise => { - const response = await api.get(`${hubDownloadEndpoint}/`); +export const listDatafiles = async (): Promise => { + const response = await api.get(`${datafileEndpoint}/`); return response.data; }; -export const getHubDownload = async (id: number): Promise => { - const response = await api.get(`${hubDownloadEndpoint}/${id}`); +export const getDatafile = async (id: number): Promise => { + const response = await api.get(`${datafileEndpoint}/${id}`); return response.data; }; -export const deleteHubDownload = async (id: number): Promise => { - await 
api.delete(`${hubDownloadEndpoint}/${id}`); +export const deleteDatafile = async (id: number): Promise => { + await api.delete(`${datafileEndpoint}/${id}`); }; -export const listHubDownloadFiles = async (id: number): Promise => { - const response = await api.get( - `${hubDownloadEndpoint}/${id}/files`, - ); +export const listDatafileFiles = async (id: number): Promise => { + const response = await api.get(`${datafileEndpoint}/${id}/files`); return response.data; }; diff --git a/DashAI/front/src/api/job.ts b/DashAI/front/src/api/job.ts index 03e89d812..49d6d3514 100644 --- a/DashAI/front/src/api/job.ts +++ b/DashAI/front/src/api/job.ts @@ -1,5 +1,5 @@ import api from "./api"; -import type { HubDownload } from "./hub"; +import type { Datafile } from "./hub"; export const isQueueEmpty = async (): Promise => { const response = await api.get<{ is_empty: boolean }>("/v1/job/is_empty"); @@ -237,12 +237,12 @@ export const enqueuePipelineJob = async ( return response.data; }; -export const enqueueHubDownloadJob = async ( +export const enqueueDatafileJob = async ( source_name: string, dataset_id: string, name: string, -): Promise => { - const response = await api.post("/v1/hub-download/", { +): Promise => { + const response = await api.post("/v1/datafile/", { source_name, dataset_id, name, diff --git a/DashAI/front/src/components/hub/DatasetDetail.jsx b/DashAI/front/src/components/hub/DatasetDetail.jsx index d5616a17d..597fec103 100644 --- a/DashAI/front/src/components/hub/DatasetDetail.jsx +++ b/DashAI/front/src/components/hub/DatasetDetail.jsx @@ -26,7 +26,7 @@ import LoadingDots from "../shared/LoadingDots"; * * @param {object|null} dataset - Selected DatasetEntry, or null if none. * @param {string|null} sourceName - Active DatasetSource class name. - * @param {object|null} download - HubDownload record for this dataset (if any). + * @param {object|null} download - Datafile record for this dataset (if any). 
* @param {boolean} downloadLoading - True while the download record is being created. * @param {function} onStartDownload - Called when user clicks "Download to DashAI". * @param {function} onStartImport - Called when download is ready and user clicks "Add to DashAI". diff --git a/DashAI/front/src/components/hub/HubImportPanel.jsx b/DashAI/front/src/components/hub/HubImportPanel.jsx index 961d7a6da..96ee64739 100644 --- a/DashAI/front/src/components/hub/HubImportPanel.jsx +++ b/DashAI/front/src/components/hub/HubImportPanel.jsx @@ -20,7 +20,7 @@ import { useNavigate } from "react-router-dom"; import { createDataset } from "../../api/datasets"; import { importHubDataset, - listHubDownloadFiles, + listDatafileFiles, previewHubDataset, } from "../../api/hub"; import { getComponents } from "../../api/component"; @@ -30,13 +30,13 @@ import PreviewDataset from "../notebooks/datasetCreation/PreviewDataset"; /** * Full-page import panel for Hub datasets. * - * Without hubDownload: step 0 = dataloader, step 1 = preview - * With hubDownload: step 0 = file select, step 1 = dataloader, step 2 = preview + * Without datafile: step 0 = dataloader, step 1 = preview + * With datafile: step 0 = file select, step 1 = dataloader, step 2 = preview */ export default function HubImportPanel({ dataset, sourceName, - hubDownload = null, + datafile = null, step, onStepChange, selectedLoader, @@ -55,8 +55,8 @@ export default function HubImportPanel({ const setStepValue = onStepChange ?? setLocalStep; const selectedValue = selectedLoader ?? localSelectedLoader; - // Whether hub download flow adds an extra file-select step at position 0 - const hasFileStep = hubDownload != null; + // Whether datafile flow adds an extra file-select step at position 0 + const hasFileStep = datafile != null; // Adjusted step indices for dataloader / preview const dataloaderStep = hasFileStep ? 1 : 0; const previewStep = hasFileStep ? 
2 : 1; @@ -115,10 +115,10 @@ export default function HubImportPanel({ // Load files when entering file-select step useEffect(() => { - if (!hasFileStep || stepValue !== 0 || !hubDownload) return; + if (!hasFileStep || stepValue !== 0 || !datafile) return; let isMounted = true; setLoadingFiles(true); - listHubDownloadFiles(hubDownload.id) + listDatafileFiles(datafile.id) .then((f) => { if (!isMounted) return; setFiles(f); @@ -133,7 +133,7 @@ export default function HubImportPanel({ return () => { isMounted = false; }; - }, [hasFileStep, stepValue, hubDownload?.id]); + }, [hasFileStep, stepValue, datafile?.id]); // Preview useEffect(() => { @@ -158,7 +158,7 @@ export default function HubImportPanel({ effectiveRows, selectedValue?.name, formValues, - hubDownload?.id, + datafile?.id, selectedFile ?? undefined, ) .then((data) => { @@ -184,7 +184,7 @@ export default function HubImportPanel({ dataset?.id, sourceName, selectedValue?.name, - hubDownload?.id, + datafile?.id, selectedFile, JSON.stringify(formValues || {}), ]); @@ -204,8 +204,8 @@ export default function HubImportPanel({ inferred_types: columnTypes, column_renames: columnRenames, }; - if (hubDownload) { - importParams.hub_download_id = hubDownload.id; + if (datafile) { + importParams.datafile_id = datafile.id; if (selectedFile) importParams.selected_file = selectedFile; } await importHubDataset(sourceName, dataset.id, created.id, importParams); diff --git a/DashAI/front/src/components/hub/HubLeftBar.jsx b/DashAI/front/src/components/hub/HubLeftBar.jsx index 7cbcee3bc..9b021d740 100644 --- a/DashAI/front/src/components/hub/HubLeftBar.jsx +++ b/DashAI/front/src/components/hub/HubLeftBar.jsx @@ -1,5 +1,12 @@ import { useState } from "react"; -import { Box, Collapse, Divider, IconButton, Tooltip, Typography } from "@mui/material"; +import { + Box, + Collapse, + Divider, + IconButton, + Tooltip, + Typography, +} from "@mui/material"; import AddIcon from "@mui/icons-material/Add"; import DeleteIcon from 
"@mui/icons-material/Delete"; import KeyboardArrowDownIcon from "@mui/icons-material/KeyboardArrowDown"; @@ -22,7 +29,9 @@ function SectionHeader({ icon: Icon, title, count, open, onToggle }) { py: 0.5, px: 1, borderRadius: 1, - "&:hover": { bgcolor: theme.palette.ui?.hover ?? theme.palette.action.hover }, + "&:hover": { + bgcolor: theme.palette.ui?.hover ?? theme.palette.action.hover, + }, }} onClick={onToggle} > @@ -72,11 +81,15 @@ function SectionHeader({ icon: Icon, title, count, open, onToggle }) { /** * Left sidebar for the Hub module — shows downloaded datasets only. * - * @param {Array} downloads - List of HubDownload records to show. + * @param {Array} downloads - List of Datafile records to show. * @param {function} onDeleteDownload - Called with download id when user deletes. * @param {function} onImportDownload - Called with download record when user clicks Add. */ -export default function HubLeftBar({ downloads = [], onDeleteDownload, onImportDownload }) { +export default function HubLeftBar({ + downloads = [], + onDeleteDownload, + onImportDownload, +}) { const { t } = useTranslation(["hub", "common"]); const theme = useTheme(); const [searchQuery, setSearchQuery] = useState(""); diff --git a/DashAI/front/src/pages/hub/HubContent.jsx b/DashAI/front/src/pages/hub/HubContent.jsx index a84c6e694..5bb361c75 100644 --- a/DashAI/front/src/pages/hub/HubContent.jsx +++ b/DashAI/front/src/pages/hub/HubContent.jsx @@ -17,12 +17,12 @@ import HubImportPanel from "../../components/hub/HubImportPanel"; import ComponentDetailsPanel from "../../components/custom/ComponentDetailsPanel"; import DataloaderConfigBar from "../../components/notebooks/datasetCreation/DataloaderConfigBar"; import { - deleteHubDownload, + deleteDatafile, getDatasetSources, - getHubDownload, - listHubDownloads, + getDatafile, + listDatafiles, } from "../../api/hub"; -import { enqueueHubDownloadJob } from "../../api/job"; +import { enqueueDatafileJob } from "../../api/job"; import { 
startJobPolling } from "../../hooks/useJobPolling"; import { useTranslation } from "react-i18next"; import { useSnackbar } from "notistack"; @@ -80,7 +80,7 @@ export default function HubContent() { }, [sourceNameParam]); useEffect(() => { - listHubDownloads() + listDatafiles() .then((rows) => { const map = {}; for (const r of rows) map[`${r.source_name}::${r.dataset_id}`] = r; @@ -100,7 +100,7 @@ export default function HubContent() { const onDone = async (isError) => { try { - const updated = await getHubDownload(d.id); + const updated = await getDatafile(d.id); setDownloads((prev) => ({ ...prev, [`${updated.source_name}::${updated.dataset_id}`]: updated, @@ -141,7 +141,7 @@ export default function HubContent() { if (!selectedDataset || !sourceNameParam) return; setDownloadLoading(true); try { - const row = await enqueueHubDownloadJob( + const row = await enqueueDatafileJob( sourceNameParam, selectedDataset.id, selectedDataset.name, @@ -159,7 +159,7 @@ export default function HubContent() { const handleDeleteDownload = async (downloadId) => { try { - await deleteHubDownload(downloadId); + await deleteDatafile(downloadId); setDownloads((prev) => { const next = { ...prev }; for (const key of Object.keys(next)) { @@ -246,7 +246,7 @@ export default function HubContent() { : selectedDataset } sourceName={importSourceName} - hubDownload={importDownload} + datafile={importDownload} step={importStep} onStepChange={setImportStep} selectedLoader={selectedDataloader} From 118d2cacfe68fae699e96a3fe2794dba3ca53b00 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Tue, 12 May 2026 15:35:57 -0400 Subject: [PATCH 104/361] refactor: remove deprecated HubDownloadJob class and related functionality --- DashAI/back/job/hub_download_job.py | 108 ---------------------------- 1 file changed, 108 deletions(-) delete mode 100644 DashAI/back/job/hub_download_job.py diff --git a/DashAI/back/job/hub_download_job.py b/DashAI/back/job/hub_download_job.py deleted file mode 100644 index 
0c8200b12..000000000 --- a/DashAI/back/job/hub_download_job.py +++ /dev/null @@ -1,108 +0,0 @@ -"""Job for downloading a dataset from an external hub source.""" - -import logging -from pathlib import Path -from typing import TYPE_CHECKING - -from kink import di, inject -from sqlalchemy import exc - -from DashAI.back.core.enums.status import HubDownloadStatus -from DashAI.back.dependencies.database.models import HubDownload -from DashAI.back.job.base_job import BaseJob, JobError - -if TYPE_CHECKING: - from sqlalchemy.orm import sessionmaker - -log = logging.getLogger(__name__) - - -class HubDownloadJob(BaseJob): - """Job that fetches a dataset file from an external hub source. - - Parameters - ---------- - kwargs : dict - - hub_download_id: int — DB row id - - source_name: str — DatasetSource class name - - dataset_source_id: str — source-specific dataset identifier - """ - - @inject - def set_status_as_delivered( - self, session_factory: "sessionmaker" = lambda di: di["session_factory"] - ) -> None: - """No-op: hub downloads don't use the delivered state.""" - - @inject - def set_status_as_error( - self, session_factory: "sessionmaker" = lambda di: di["session_factory"] - ) -> None: - hub_download_id: int = self.kwargs["hub_download_id"] - error_msg: str = self.kwargs.get("_error_message", "") - with session_factory() as db: - row: HubDownload = db.get(HubDownload, hub_download_id) - if row is not None: - row.status = HubDownloadStatus.ERROR - row.error_message = error_msg - try: - db.commit() - except exc.SQLAlchemyError as e: - log.exception(e) - - def get_job_name(self) -> str: - return f"Hub download: {self.kwargs.get('dataset_source_id', '')}" - - @inject - def run(self) -> None: - import shutil - - component_registry = di["component_registry"] - session_factory = di["session_factory"] - config = di["config"] - - hub_download_id: int = self.kwargs["hub_download_id"] - source_name: str = self.kwargs["source_name"] - dataset_source_id: str = 
self.kwargs["dataset_source_id"] - - download_dir: Path = config["DATAFILE_PATH"] / str(hub_download_id) - - try: - sources = component_registry._registry.get("DatasetSource", {}) - if source_name not in sources: - raise JobError(f"DatasetSource '{source_name}' not found in registry.") - - download_dir.mkdir(parents=True, exist_ok=True) - source = sources[source_name]["class"]() - file_path = source.download_dataset(dataset_source_id, str(download_dir)) - log.debug("Hub dataset '%s' downloaded to %s", dataset_source_id, file_path) - - with session_factory() as db: - row: HubDownload = db.get(HubDownload, hub_download_id) - if row is None: - raise JobError(f"HubDownload row {hub_download_id} not found.") - row.status = HubDownloadStatus.READY - row.local_path = str(download_dir) - try: - db.commit() - except exc.SQLAlchemyError as e: - log.exception(e) - raise JobError("DB error saving download path.") from e - - log.debug("Hub download job %d completed.", hub_download_id) - - except JobError as e: - log.error("Hub download job %d failed: %s", hub_download_id, e) - self.kwargs["_error_message"] = str(e) - with session_factory() as db: - row = db.get(HubDownload, hub_download_id) - if row is not None: - row.status = HubDownloadStatus.ERROR - row.error_message = str(e) - try: - db.commit() - except exc.SQLAlchemyError as db_err: - log.exception(db_err) - if download_dir.exists(): - shutil.rmtree(download_dir, ignore_errors=True) - raise From 381f97ab526bf94bdbec21967d110f49b45ffe01 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Tue, 12 May 2026 15:37:33 -0400 Subject: [PATCH 105/361] fix: catch all exceptions in DatafileJob, wrap HF load errors as RuntimeError --- .../dataset_sources/huggingface_dataset_source.py | 4 +++- DashAI/back/job/datafile_job.py | 13 ++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/DashAI/back/dataset_sources/huggingface_dataset_source.py b/DashAI/back/dataset_sources/huggingface_dataset_source.py index 
3c730099a..9339b3fec 100644 --- a/DashAI/back/dataset_sources/huggingface_dataset_source.py +++ b/DashAI/back/dataset_sources/huggingface_dataset_source.py @@ -182,7 +182,9 @@ def _load_split(ds) -> pd.DataFrame: return out_path except ValueError as exc: if "Config name is missing" not in str(exc): - raise + raise RuntimeError( + f"Cannot load dataset '{dataset_id}': {exc}" + ) from exc configs = get_dataset_config_names(dataset_id) log.debug("%s has %d configs, downloading all", dataset_id, len(configs)) diff --git a/DashAI/back/job/datafile_job.py b/DashAI/back/job/datafile_job.py index ddcc9b852..2459b3167 100644 --- a/DashAI/back/job/datafile_job.py +++ b/DashAI/back/job/datafile_job.py @@ -91,18 +91,21 @@ def run(self) -> None: log.debug("Datafile download job %d completed.", datafile_id) - except JobError as e: - log.error("Datafile download job %d failed: %s", datafile_id, e) - self.kwargs["_error_message"] = str(e) + except Exception as e: + err_msg = str(e) + log.error("Datafile download job %d failed: %s", datafile_id, err_msg) + self.kwargs["_error_message"] = err_msg with session_factory() as db: row = db.get(Datafile, datafile_id) if row is not None: row.status = DatafileStatus.ERROR - row.error_message = str(e) + row.error_message = err_msg try: db.commit() except exc.SQLAlchemyError as db_err: log.exception(db_err) if download_dir.exists(): shutil.rmtree(download_dir, ignore_errors=True) - raise + if isinstance(e, JobError): + raise + raise JobError(err_msg) from e From 512930c1c393c4132c948b70476641d367edf975 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Tue, 12 May 2026 15:44:14 -0400 Subject: [PATCH 106/361] fix: call set_status_as_error in DatafileJob on failure --- DashAI/back/job/datafile_job.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/DashAI/back/job/datafile_job.py b/DashAI/back/job/datafile_job.py index 2459b3167..4aa563212 100644 --- a/DashAI/back/job/datafile_job.py +++ b/DashAI/back/job/datafile_job.py @@ 
-95,15 +95,7 @@ def run(self) -> None: err_msg = str(e) log.error("Datafile download job %d failed: %s", datafile_id, err_msg) self.kwargs["_error_message"] = err_msg - with session_factory() as db: - row = db.get(Datafile, datafile_id) - if row is not None: - row.status = DatafileStatus.ERROR - row.error_message = err_msg - try: - db.commit() - except exc.SQLAlchemyError as db_err: - log.exception(db_err) + self.set_status_as_error() if download_dir.exists(): shutil.rmtree(download_dir, ignore_errors=True) if isinstance(e, JobError): From 1b69596af886a610cf0b828384597212ce3b666b Mon Sep 17 00:00:00 2001 From: Creylay Date: Tue, 12 May 2026 15:51:18 -0400 Subject: [PATCH 107/361] Refactor SelectDataloaderStep to simplify navigation and integrate tour data for CSV components --- .../components/custom/ComponentSelector.jsx | 5 +++ .../datasetCreation/SelectDataloaderStep.jsx | 31 ++----------------- 2 files changed, 8 insertions(+), 28 deletions(-) diff --git a/DashAI/front/src/components/custom/ComponentSelector.jsx b/DashAI/front/src/components/custom/ComponentSelector.jsx index e2b2ad2bb..6fd3610a4 100644 --- a/DashAI/front/src/components/custom/ComponentSelector.jsx +++ b/DashAI/front/src/components/custom/ComponentSelector.jsx @@ -37,6 +37,7 @@ function ComponentSelector({ searchPlaceholder, emptyText, getIcon, + tourDataFor = null, }) { const { t } = useTranslation("custom"); const [search, setSearch] = useState(""); @@ -195,11 +196,15 @@ function ComponentSelector({ {items.map((component) => { const isSelected = selected?.name === component.name; const icon = getIcon?.(component); + const isCsvComponent = + tourDataFor && + component.name.toLowerCase().includes("csv"); return ( handleSelect(component)} + data-tour={isCsvComponent ? 
tourDataFor : undefined} sx={{ p: 1.5, display: "flex", diff --git a/DashAI/front/src/components/notebooks/datasetCreation/SelectDataloaderStep.jsx b/DashAI/front/src/components/notebooks/datasetCreation/SelectDataloaderStep.jsx index 80cd7716e..21fabf7b4 100644 --- a/DashAI/front/src/components/notebooks/datasetCreation/SelectDataloaderStep.jsx +++ b/DashAI/front/src/components/notebooks/datasetCreation/SelectDataloaderStep.jsx @@ -1,4 +1,3 @@ -import { useEffect } from "react"; import ComponentSelector from "../../custom/ComponentSelector"; import { Box, CircularProgress, Stack } from "@mui/material"; import { useTourContext } from "../../tour/TourProvider"; @@ -26,37 +25,12 @@ export default function SelectDataloaderStep({ const { t } = useTranslation(["datasets", "common"]); const handleNext = () => { - if (tourContext?.run) { - goToNextStep(); - const observer = new MutationObserver(() => { - if (document.querySelector('[data-tour="upload-area"]')) { - observer.disconnect(); - tourContext.nextStep(); - } - }); - observer.observe(document.body, { childList: true, subtree: true }); - } else { - goToNextStep(); - } + goToNextStep(); }; - useEffect(() => { - if (!loadingDataloaders && tourContext?.run) { - setTimeout(() => { - const cards = document.querySelectorAll('[role="button"]'); - cards.forEach((card) => { - const cardText = card.textContent; - if (cardText.includes("CSVDataLoader") || cardText.includes("CSV")) { - card.setAttribute("data-tour", "csv-dataloader-option"); - } - }); - }, 100); - } - }, [loadingDataloaders, tourContext]); - return ( - + {loadingDataloaders ? 
( )} From b3260b68fdc5ec7ef8bba1adc19eac3ecf559b51 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Tue, 12 May 2026 15:53:19 -0400 Subject: [PATCH 108/361] fix: show check queue hint in download error snackbar --- DashAI/front/src/pages/hub/HubContent.jsx | 7 ++++--- DashAI/front/src/utils/i18n/locales/en/hub.json | 3 ++- DashAI/front/src/utils/i18n/locales/es/hub.json | 3 ++- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/DashAI/front/src/pages/hub/HubContent.jsx b/DashAI/front/src/pages/hub/HubContent.jsx index 5bb361c75..dd943f38d 100644 --- a/DashAI/front/src/pages/hub/HubContent.jsx +++ b/DashAI/front/src/pages/hub/HubContent.jsx @@ -106,9 +106,10 @@ export default function HubContent() { [`${updated.source_name}::${updated.dataset_id}`]: updated, })); if (isError) { - enqueueSnackbar(t("hub:downloadFailed") + `: ${d.name}`, { - variant: "error", - }); + enqueueSnackbar( + `${t("hub:downloadFailed")}: ${d.name} - ${t("hub:checkQueue")}`, + { variant: "error" }, + ); } else { enqueueSnackbar(t("hub:downloaded") + `: ${d.name}`, { variant: "success", diff --git a/DashAI/front/src/utils/i18n/locales/en/hub.json b/DashAI/front/src/utils/i18n/locales/en/hub.json index ad26c9f64..bb65b7616 100644 --- a/DashAI/front/src/utils/i18n/locales/en/hub.json +++ b/DashAI/front/src/utils/i18n/locales/en/hub.json @@ -54,5 +54,6 @@ "fromSource": "From {{source}}", "selectSourceSubtitle": "Select a source to browse and import datafiles.", "searchDownloads": "Search datafiles...", - "datasetDetails": "Datafile Details" + "datasetDetails": "Datafile Details", + "checkQueue": "Check the job queue for details" } diff --git a/DashAI/front/src/utils/i18n/locales/es/hub.json b/DashAI/front/src/utils/i18n/locales/es/hub.json index 261624eef..099ea52be 100644 --- a/DashAI/front/src/utils/i18n/locales/es/hub.json +++ b/DashAI/front/src/utils/i18n/locales/es/hub.json @@ -54,5 +54,6 @@ "fromSource": "De {{source}}", "selectSourceSubtitle": "Selecciona una fuente para explorar e 
importar datafiles.", "searchDownloads": "Buscar datafiles...", - "datasetDetails": "Detalles del Datafile" + "datasetDetails": "Detalles del Datafile", + "checkQueue": "Revisa la cola de trabajos para más detalles" } From df12e438a6597666f78cbfc89b60b45f4c97498b Mon Sep 17 00:00:00 2001 From: Creylay Date: Tue, 12 May 2026 15:57:47 -0400 Subject: [PATCH 109/361] Integrate tour context into SelectDataloaderStep and StepperNavigationFooter for enhanced onboarding experience --- .../datasetCreation/SelectDataloaderStep.jsx | 12 ++++++++++++ .../components/shared/StepperNavigationFooter.jsx | 3 +++ 2 files changed, 15 insertions(+) diff --git a/DashAI/front/src/components/notebooks/datasetCreation/SelectDataloaderStep.jsx b/DashAI/front/src/components/notebooks/datasetCreation/SelectDataloaderStep.jsx index 21fabf7b4..d478f0eb1 100644 --- a/DashAI/front/src/components/notebooks/datasetCreation/SelectDataloaderStep.jsx +++ b/DashAI/front/src/components/notebooks/datasetCreation/SelectDataloaderStep.jsx @@ -26,6 +26,15 @@ export default function SelectDataloaderStep({ const handleNext = () => { goToNextStep(); + if (tourContext?.run) { + const observer = new MutationObserver(() => { + if (document.querySelector('[data-tour="upload-area"]')) { + observer.disconnect(); + tourContext.nextStep(); + } + }); + observer.observe(document.body, { childList: true, subtree: true }); + } }; return ( @@ -71,6 +80,9 @@ export default function SelectDataloaderStep({ onBack={goToPrevStep} onNext={handleNext} nextDisabled={!selectedDataloader?.name} + nextDataTour={ + tourContext?.run ? 
"dataloader-step-next-button" : undefined + } /> ); diff --git a/DashAI/front/src/components/shared/StepperNavigationFooter.jsx b/DashAI/front/src/components/shared/StepperNavigationFooter.jsx index 2a7c44a10..7a63ed81c 100644 --- a/DashAI/front/src/components/shared/StepperNavigationFooter.jsx +++ b/DashAI/front/src/components/shared/StepperNavigationFooter.jsx @@ -30,6 +30,7 @@ export default function StepperNavigationFooter({ variant = "next", loading = false, sx = {}, + nextDataTour = null, }) { const { t } = useTranslation(["common"]); @@ -65,6 +66,7 @@ export default function StepperNavigationFooter({ variant="contained" onClick={onNext} disabled={nextDisabled || loading} + data-tour={nextDataTour || undefined} sx={{ position: "relative", }} @@ -104,4 +106,5 @@ StepperNavigationFooter.propTypes = { variant: PropTypes.oneOf(["next", "save"]), loading: PropTypes.bool, sx: PropTypes.object, + nextDataTour: PropTypes.string, }; From 797a83b15a63fb40d2dc420e5786c29ee6892ce6 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Tue, 12 May 2026 15:58:02 -0400 Subject: [PATCH 110/361] feat: click datafile row to import, remove add icon button --- .../front/src/components/hub/HubLeftBar.jsx | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/DashAI/front/src/components/hub/HubLeftBar.jsx b/DashAI/front/src/components/hub/HubLeftBar.jsx index 9b021d740..edc3520ca 100644 --- a/DashAI/front/src/components/hub/HubLeftBar.jsx +++ b/DashAI/front/src/components/hub/HubLeftBar.jsx @@ -7,7 +7,6 @@ import { Tooltip, Typography, } from "@mui/material"; -import AddIcon from "@mui/icons-material/Add"; import DeleteIcon from "@mui/icons-material/Delete"; import KeyboardArrowDownIcon from "@mui/icons-material/KeyboardArrowDown"; import KeyboardArrowRightIcon from "@mui/icons-material/KeyboardArrowRight"; @@ -159,7 +158,15 @@ export default function HubLeftBar({ py: 0.5, px: 1, borderRadius: 1, + cursor: dl.status === "ready" ? 
"pointer" : "default", + "&:hover": + dl.status === "ready" + ? { bgcolor: (theme) => theme.palette.action.hover } + : {}, }} + onClick={() => + dl.status === "ready" && onImportDownload?.(dl) + } > @@ -181,16 +188,6 @@ export default function HubLeftBar({ - {dl.status === "ready" && ( - - onImportDownload?.(dl)} - > - - - - )} Date: Tue, 12 May 2026 16:10:46 -0400 Subject: [PATCH 111/361] feat: store and display metadata for downloaded datafiles --- .../c3d7a1f05e8b_add_metadata_to_datafile.py | 32 ++++ DashAI/back/api/api_v1/endpoints/datafile.py | 16 ++ DashAI/back/dependencies/database/models.py | 8 +- DashAI/back/job/datafile_job.py | 7 + DashAI/front/src/api/hub.ts | 4 + DashAI/front/src/api/job.ts | 6 + .../src/components/hub/DatafileInfoPanel.jsx | 142 ++++++++++++++++++ DashAI/front/src/pages/hub/HubContent.jsx | 10 +- 8 files changed, 223 insertions(+), 2 deletions(-) create mode 100644 DashAI/alembic/versions/c3d7a1f05e8b_add_metadata_to_datafile.py create mode 100644 DashAI/front/src/components/hub/DatafileInfoPanel.jsx diff --git a/DashAI/alembic/versions/c3d7a1f05e8b_add_metadata_to_datafile.py b/DashAI/alembic/versions/c3d7a1f05e8b_add_metadata_to_datafile.py new file mode 100644 index 000000000..fea1ca339 --- /dev/null +++ b/DashAI/alembic/versions/c3d7a1f05e8b_add_metadata_to_datafile.py @@ -0,0 +1,32 @@ +"""Add metadata columns to datafile table + +Revision ID: c3d7a1f05e8b +Revises: a1c3e5f7b9d2 +Create Date: 2026-05-12 00:00:00.000000 + +""" + +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. 
+revision: str = "c3d7a1f05e8b" +down_revision: Union[str, None] = "a1c3e5f7b9d2" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.add_column("datafile", sa.Column("size_bytes", sa.BigInteger(), nullable=True)) + op.add_column("datafile", sa.Column("description", sa.Text(), nullable=True)) + op.add_column("datafile", sa.Column("tags", sa.Text(), nullable=True)) + op.add_column("datafile", sa.Column("source_url", sa.Text(), nullable=True)) + + +def downgrade() -> None: + op.drop_column("datafile", "source_url") + op.drop_column("datafile", "tags") + op.drop_column("datafile", "description") + op.drop_column("datafile", "size_bytes") diff --git a/DashAI/back/api/api_v1/endpoints/datafile.py b/DashAI/back/api/api_v1/endpoints/datafile.py index fbad92aaa..e64f433c7 100644 --- a/DashAI/back/api/api_v1/endpoints/datafile.py +++ b/DashAI/back/api/api_v1/endpoints/datafile.py @@ -1,5 +1,6 @@ """Datafile management endpoints.""" +import json import logging import os from pathlib import Path @@ -32,6 +33,10 @@ def _row_to_dict(row: Datafile) -> Dict[str, Any]: "local_path": row.local_path, "status": row.status.value, "error_message": row.error_message, + "size_bytes": row.size_bytes, + "description": row.description, + "tags": json.loads(row.tags) if row.tags else [], + "source_url": row.source_url, "created": row.created.isoformat() if row.created else None, "last_modified": row.last_modified.isoformat() if row.last_modified else None, } @@ -41,6 +46,9 @@ class CreateDownloadRequest(BaseModel): source_name: str dataset_id: str name: str + description: str = "" + tags: list[str] = [] + source_url: str = "" @router.get("/", response_model=List[Dict[str, Any]]) @@ -93,6 +101,9 @@ async def create_download( existing.error_message = None existing.local_path = None existing.name = body.name + existing.description = body.description + existing.tags = json.dumps(body.tags) + 
existing.source_url = body.source_url or None try: db.commit() db.refresh(existing) @@ -108,6 +119,9 @@ async def create_download( source_name=body.source_name, dataset_id=body.dataset_id, name=body.name, + description=body.description, + tags=json.dumps(body.tags), + source_url=body.source_url or None, status=DatafileStatus.DOWNLOADING, ) db.add(row) @@ -129,6 +143,8 @@ async def create_download( "datafile_id": datafile_id, "source_name": body.source_name, "dataset_source_id": body.dataset_id, + "description": body.description, + "tags": body.tags, } ) job_result = job_queue.put(job) diff --git a/DashAI/back/dependencies/database/models.py b/DashAI/back/dependencies/database/models.py index 677433306..983fd8f19 100644 --- a/DashAI/back/dependencies/database/models.py +++ b/DashAI/back/dependencies/database/models.py @@ -1,10 +1,11 @@ import logging import pathlib from datetime import datetime -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional from sqlalchemy import ( JSON, + BigInteger, Boolean, DateTime, Enum, @@ -13,6 +14,7 @@ Integer, MetaData, String, + Text, UniqueConstraint, ) from sqlalchemy.ext.declarative import declarative_base @@ -734,6 +736,10 @@ class Datafile(Base): default=DatafileStatus.DOWNLOADING, ) error_message: Mapped[str] = mapped_column(String, nullable=True) + size_bytes: Mapped[Optional[int]] = mapped_column(BigInteger, nullable=True) + description: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + tags: Mapped[Optional[str]] = mapped_column(Text, nullable=True) # JSON array + source_url: Mapped[Optional[str]] = mapped_column(Text, nullable=True) created: Mapped[DateTime] = mapped_column(DateTime, default=datetime.now) last_modified: Mapped[DateTime] = mapped_column( DateTime, diff --git a/DashAI/back/job/datafile_job.py b/DashAI/back/job/datafile_job.py index 4aa563212..c16a242e5 100644 --- a/DashAI/back/job/datafile_job.py +++ b/DashAI/back/job/datafile_job.py @@ -1,5 +1,6 @@ """Job for 
downloading a dataset from an external hub source.""" +import json import logging from pathlib import Path from typing import TYPE_CHECKING @@ -83,6 +84,12 @@ def run(self) -> None: raise JobError(f"Datafile row {datafile_id} not found.") row.status = DatafileStatus.READY row.local_path = str(download_dir) + size_bytes = sum( + f.stat().st_size for f in download_dir.rglob("*") if f.is_file() + ) + row.size_bytes = size_bytes + row.description = self.kwargs.get("description", "") + row.tags = json.dumps(self.kwargs.get("tags", [])) try: db.commit() except exc.SQLAlchemyError as e: diff --git a/DashAI/front/src/api/hub.ts b/DashAI/front/src/api/hub.ts index 2d4b52600..67cd2440b 100644 --- a/DashAI/front/src/api/hub.ts +++ b/DashAI/front/src/api/hub.ts @@ -112,6 +112,10 @@ export interface Datafile { local_path: string | null; status: DatafileStatus; error_message: string | null; + size_bytes: number | null; + description: string; + tags: string[]; + source_url: string | null; created: string | null; last_modified: string | null; job_id?: string; diff --git a/DashAI/front/src/api/job.ts b/DashAI/front/src/api/job.ts index 49d6d3514..e4b5dcff1 100644 --- a/DashAI/front/src/api/job.ts +++ b/DashAI/front/src/api/job.ts @@ -241,11 +241,17 @@ export const enqueueDatafileJob = async ( source_name: string, dataset_id: string, name: string, + description: string = "", + tags: string[] = [], + source_url: string = "", ): Promise => { const response = await api.post("/v1/datafile/", { source_name, dataset_id, name, + description, + tags, + source_url, }); return response.data; }; diff --git a/DashAI/front/src/components/hub/DatafileInfoPanel.jsx b/DashAI/front/src/components/hub/DatafileInfoPanel.jsx new file mode 100644 index 000000000..452c006d6 --- /dev/null +++ b/DashAI/front/src/components/hub/DatafileInfoPanel.jsx @@ -0,0 +1,142 @@ +import { Box, Chip, Divider, Link, Stack, Typography } from "@mui/material"; +import { useTheme } from "@mui/material/styles"; +import { 
useTranslation } from "react-i18next"; +import SideBar from "../threeSectionLayout/panelContainers/SideBar"; + +const formatSize = (bytes) => { + if (!bytes) return null; + if (bytes < 1024) return `${bytes} B`; + if (bytes < 1024 ** 2) return `${(bytes / 1024).toFixed(1)} KB`; + if (bytes < 1024 ** 3) return `${(bytes / 1024 ** 2).toFixed(1)} MB`; + return `${(bytes / 1024 ** 3).toFixed(1)} GB`; +}; + +/** + * Right panel — metadata view for a downloaded Datafile record. + * + * @param {object} datafile - Datafile record from the DB. + */ +export default function DatafileInfoPanel({ datafile }) { + const { t } = useTranslation(["hub"]); + const theme = useTheme(); + + return ( + + {/* Title */} + + + {t("hub:datasetDetails")} + + + + {/* Content */} + {!datafile ? ( + + + {t("hub:selectDatasetToPreview")} + + + ) : ( + + + + {datafile.name} + + + + + {datafile.description && ( + + {datafile.description} + + )} + + + + + + + {t("hub:source")} + + {datafile.source_name} + + + {datafile.source_url && ( + + + {t("hub:viewOnSource")} + + + + {datafile.source_url} + + + + )} + + + + {t("hub:size")} + + + {formatSize(datafile.size_bytes) ?? 
t("hub:notAvailable")} + + + + {datafile.tags?.length > 0 && ( + + + {t("hub:tags")} + + + {datafile.tags.map((tag) => ( + + ))} + + + )} + + + + )} + + ); +} diff --git a/DashAI/front/src/pages/hub/HubContent.jsx b/DashAI/front/src/pages/hub/HubContent.jsx index dd943f38d..1332e0a49 100644 --- a/DashAI/front/src/pages/hub/HubContent.jsx +++ b/DashAI/front/src/pages/hub/HubContent.jsx @@ -14,6 +14,7 @@ import HubLeftBar from "../../components/hub/HubLeftBar"; import DatasetGrid from "../../components/hub/DatasetGrid"; import DatasetDetail from "../../components/hub/DatasetDetail"; import HubImportPanel from "../../components/hub/HubImportPanel"; +import DatafileInfoPanel from "../../components/hub/DatafileInfoPanel"; import ComponentDetailsPanel from "../../components/custom/ComponentDetailsPanel"; import DataloaderConfigBar from "../../components/notebooks/datasetCreation/DataloaderConfigBar"; import { @@ -146,6 +147,9 @@ export default function HubContent() { sourceNameParam, selectedDataset.id, selectedDataset.name, + selectedDataset.description ?? "", + selectedDataset.tags ?? [], + selectedDataset.url ?? "", ); setDownloads((prev) => ({ ...prev, @@ -281,7 +285,11 @@ export default function HubContent() { {importMode ? ( importStep === 0 ? ( - + importDownload ? 
( + + ) : ( + + ) ) : ( Date: Tue, 12 May 2026 16:38:38 -0400 Subject: [PATCH 112/361] feat: unify right panel layout between DatasetDetail and DatafileInfoPanel --- .../src/components/hub/DatafileInfoPanel.jsx | 33 ++++++++----------- .../src/components/hub/DatasetDetail.jsx | 4 +++ 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/DashAI/front/src/components/hub/DatafileInfoPanel.jsx b/DashAI/front/src/components/hub/DatafileInfoPanel.jsx index 452c006d6..6f15f6d91 100644 --- a/DashAI/front/src/components/hub/DatafileInfoPanel.jsx +++ b/DashAI/front/src/components/hub/DatafileInfoPanel.jsx @@ -1,4 +1,5 @@ import { Box, Chip, Divider, Link, Stack, Typography } from "@mui/material"; +import OpenInNewIcon from "@mui/icons-material/OpenInNew"; import { useTheme } from "@mui/material/styles"; import { useTranslation } from "react-i18next"; import SideBar from "../threeSectionLayout/panelContainers/SideBar"; @@ -58,9 +59,21 @@ export default function DatafileInfoPanel({ datafile }) { - + {datafile.name} + + {datafile.source_url && ( + + {t("hub:viewOnSource")} + + )} @@ -80,24 +93,6 @@ export default function DatafileInfoPanel({ datafile }) { {datafile.source_name} - {datafile.source_url && ( - - - {t("hub:viewOnSource")} - - - - {datafile.source_url} - - - - )} - {t("hub:size")} diff --git a/DashAI/front/src/components/hub/DatasetDetail.jsx b/DashAI/front/src/components/hub/DatasetDetail.jsx index 597fec103..ae9ba8dbb 100644 --- a/DashAI/front/src/components/hub/DatasetDetail.jsx +++ b/DashAI/front/src/components/hub/DatasetDetail.jsx @@ -259,6 +259,10 @@ export default function DatasetDetail({ label={tag} size="small" variant="outlined" + sx={{ + ...theme.typography.statusBadge, + border: `1px solid ${theme.palette.divider}`, + }} /> ))} From 47576d8876bb28e68c45eae05914a9badd854868 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Tue, 12 May 2026 17:24:01 -0400 Subject: [PATCH 113/361] fix: correct capitalization in fromSource translation --- 
DashAI/front/src/utils/i18n/locales/en/hub.json | 2 +- DashAI/front/src/utils/i18n/locales/es/hub.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/DashAI/front/src/utils/i18n/locales/en/hub.json b/DashAI/front/src/utils/i18n/locales/en/hub.json index bb65b7616..d6ec73395 100644 --- a/DashAI/front/src/utils/i18n/locales/en/hub.json +++ b/DashAI/front/src/utils/i18n/locales/en/hub.json @@ -51,7 +51,7 @@ "stepFileSubtitle": "Choose the file to import from the downloaded datafile.", "noFilesFound": "No files found in this download.", "sources": "Sources", - "fromSource": "From {{source}}", + "fromSource": "from {{source}}", "selectSourceSubtitle": "Select a source to browse and import datafiles.", "searchDownloads": "Search datafiles...", "datasetDetails": "Datafile Details", diff --git a/DashAI/front/src/utils/i18n/locales/es/hub.json b/DashAI/front/src/utils/i18n/locales/es/hub.json index 099ea52be..a13467468 100644 --- a/DashAI/front/src/utils/i18n/locales/es/hub.json +++ b/DashAI/front/src/utils/i18n/locales/es/hub.json @@ -51,7 +51,7 @@ "stepFileSubtitle": "Elige el archivo a importar de la descarga.", "noFilesFound": "No se encontraron archivos en esta descarga.", "sources": "Fuentes", - "fromSource": "De {{source}}", + "fromSource": "de {{source}}", "selectSourceSubtitle": "Selecciona una fuente para explorar e importar datafiles.", "searchDownloads": "Buscar datafiles...", "datasetDetails": "Detalles del Datafile", From 422d68158e0cc40171416990f40ef2bb5e2e04b8 Mon Sep 17 00:00:00 2001 From: Irozuku Date: Tue, 12 May 2026 17:24:05 -0400 Subject: [PATCH 114/361] feat: add confirmation dialog before deleting a datafile --- .../front/src/components/hub/HubLeftBar.jsx | 54 ++++++++++++++++--- 1 file changed, 47 insertions(+), 7 deletions(-) diff --git a/DashAI/front/src/components/hub/HubLeftBar.jsx b/DashAI/front/src/components/hub/HubLeftBar.jsx index edc3520ca..c2c32f150 100644 --- a/DashAI/front/src/components/hub/HubLeftBar.jsx +++ 
b/DashAI/front/src/components/hub/HubLeftBar.jsx @@ -1,7 +1,13 @@ import { useState } from "react"; import { Box, + Button, Collapse, + Dialog, + DialogActions, + DialogContent, + DialogContentText, + DialogTitle, Divider, IconButton, Tooltip, @@ -93,11 +99,17 @@ export default function HubLeftBar({ const theme = useTheme(); const [searchQuery, setSearchQuery] = useState(""); const [downloadsOpen, setDownloadsOpen] = useState(true); + const [pendingDeleteId, setPendingDeleteId] = useState(null); const filteredDownloads = downloads.filter((dl) => dl.name.toLowerCase().includes(searchQuery.toLowerCase()), ); + const handleDeleteConfirm = () => { + onDeleteDownload?.(pendingDeleteId); + setPendingDeleteId(null); + }; + return ( - onDeleteDownload?.(dl.id)} - disabled={dl.status === "downloading"} - > - - + + { + e.stopPropagation(); + setPendingDeleteId(dl.id); + }} + disabled={dl.status === "downloading"} + > + + + @@ -207,6 +227,26 @@ export default function HubLeftBar({