diff --git a/DashAI/back/api/api_v1/endpoints/datasets.py b/DashAI/back/api/api_v1/endpoints/datasets.py index 0b56e83c6..9a646a5d1 100644 --- a/DashAI/back/api/api_v1/endpoints/datasets.py +++ b/DashAI/back/api/api_v1/endpoints/datasets.py @@ -1813,6 +1813,7 @@ async def preview_with_types( try: inference_rows = parsed_params.get("inference_rows", 1000) + use_native_types = parsed_params.get("use_native_types", False) dataloader_name = parsed_params.get("dataloader_name") if not dataloader_name: @@ -1830,6 +1831,11 @@ async def preview_with_types( dataloader_cls = component_registry[dataloader_name]["class"] dataloader = dataloader_cls() + native_types = None + should_use_native = ( + use_native_types and dataloader_cls.SUPPORTS_NATIVE_TYPES + ) + if file.filename.endswith(".zip"): allowed_exts = dataloader_cls.SUPPORTED_EXTENSIONS extract_dir = tempfile.mkdtemp() @@ -1915,6 +1921,11 @@ async def preview_with_types( n_rows=inference_rows, ) + if should_use_native: + native_types = dataloader.extract_native_types( + matched_file, parsed_params + ) + finally: with contextlib.suppress(Exception): shutil.rmtree(extract_dir, ignore_errors=True) @@ -1926,17 +1937,25 @@ async def preview_with_types( n_rows=inference_rows, ) + if should_use_native: + native_types = dataloader.extract_native_types( + tmp_file_path, parsed_params + ) + sample_df = loaded_dataset.head(100) table = pa.Table.from_pandas(loaded_dataset) arrow_schema = arrow_to_dashai_schema(table) - methods = parsed_params.get("methods", ["DashAIPtype"]) - inferred_types = {} + if native_types is not None: + inferred_types = native_types + else: + methods = parsed_params.get("methods", ["DashAIPtype"]) + inferred_types = {} - for method in methods: - method_types = infer_types(loaded_dataset, method=method) - inferred_types.update(method_types) + for method in methods: + method_types = infer_types(loaded_dataset, method=method) + inferred_types.update(method_types) sample_df = sample_df.replace({np.nan: None, np.inf: None, -np.inf: None}) sample = sample_df.to_dict(orient="records") diff --git a/DashAI/back/dataloaders/classes/arff_dataloader.py b/DashAI/back/dataloaders/classes/arff_dataloader.py index 11a3abf7a..7f6ee93ef 100644 --- a/DashAI/back/dataloaders/classes/arff_dataloader.py +++ b/DashAI/back/dataloaders/classes/arff_dataloader.py @@ -30,6 +30,19 @@ class ARFFDataLoader(BaseDataLoader): SUPPORTED_EXTENSIONS: frozenset[str] = frozenset({".arff", ".zip"}) COMPATIBLE_COMPONENTS = ["TabularClassificationTask"] SCHEMA = ARFFDataloaderSchema + SUPPORTS_NATIVE_TYPES: bool = True + NATIVE_TYPE_MAPPING: Dict[str, Dict[str, Any]] = { + "numeric": {"type": "Float", "dtype": "float64"}, + "real": {"type": "Float", "dtype": "float64"}, + "integer": {"type": "Integer", "dtype": "int64"}, + "nominal": { + "type": "Categorical", + "dtype": "string", + "encoder": "one_hot", + }, + "string": {"type": "Text", "dtype": "string", "encoding": "utf-8"}, + "date": {"type": "Text", "dtype": "string", "encoding": "utf-8"}, + } DESCRIPTION: str = MultilingualString( en=( @@ -56,6 +69,25 @@ class ARFFDataLoader(BaseDataLoader): pt="Carregador de Dados ARFF", ) + def _load_arff_raw(self, filepath: str): + """Read raw scipy ARFF ``(data, meta)`` tuple. + + Centralises the scipy call so the metadata object (discarded by + ``_read_arff_file``) is available to ``extract_native_types``. + + Raises + ------ + datasets.builder.DatasetGenerationError + If the file cannot be parsed as valid ARFF. + """ + from datasets.builder import DatasetGenerationError + from scipy.io import arff + + try: + return arff.loadarff(filepath) + except Exception as e: + raise DatasetGenerationError from e + def _read_arff_file(self, filepath: str): """Read an ARFF file and return a pandas DataFrame. @@ -75,20 +107,71 @@ def _read_arff_file(self, filepath: str): If the file cannot be parsed as valid ARFF. """ import pandas as pd - from datasets.builder import DatasetGenerationError - from scipy.io import arff - - try: - data, _ = arff.loadarff(filepath) - except Exception as e: - raise DatasetGenerationError from e + data, _ = self._load_arff_raw(filepath) arff_df = pd.DataFrame(data) for col in arff_df.columns: if arff_df[col].dtype == object: arff_df[col] = arff_df[col].str.decode("utf-8") return arff_df + @staticmethod + def _decode_if_bytes(value: Any) -> Any: + """Return ``value`` UTF-8 decoded if it is bytes, otherwise unchanged.""" + return value.decode("utf-8") if isinstance(value, bytes) else value + + def extract_native_types( + self, + filepath_or_buffer: str, + params: Dict[str, Any], + ) -> Dict[str, Dict[str, Any]]: + """Build the DashAI column-type map from the ARFF header itself. + + Reads the scipy metadata object and converts each declared attribute + kind (``numeric``, ``integer``, ``real``, ``nominal``, ``string``, + ``date``) into the same dict shape used by + ``DashAIPtype.infer_types``. For ``nominal`` attributes the + category list comes straight from the ARFF header (e.g. + ``@attribute color {red, green, blue}``), no statistical guess. + + Parameters + ---------- + filepath_or_buffer : str + Path to a single ARFF file already on disk. + params : Dict[str, Any] + Unused (ARFF needs no parameters). + + Returns + ------- + Dict[str, Dict[str, Any]] + Column name -> DashAI type dict. + """ + _, meta = self._load_arff_raw(filepath_or_buffer) + + native_types: Dict[str, Dict[str, Any]] = {} + for col_name in meta.names(): + kind, values = meta[col_name] + kind_key = kind.lower() if isinstance(kind, str) else "string" + + if kind_key in self.NATIVE_TYPE_MAPPING: + info = self.NATIVE_TYPE_MAPPING[kind_key].copy() + else: + info = {"type": "Text", "dtype": "string"} + + if kind_key == "nominal" and values is not None: + info["categories"] = [self._decode_if_bytes(v) for v in values] + + info["inference_reason"] = { + "source": "arff_metadata", + "native_type": kind_key, + "final_type": info.get("type"), + "is_categorical": kind_key == "nominal", + } + + native_types[col_name] = info + + return native_types + def load_data( self, filepath_or_buffer: str, diff --git a/DashAI/back/dataloaders/classes/dataloader.py b/DashAI/back/dataloaders/classes/dataloader.py index c04c1f0b1..b8c912b8c 100644 --- a/DashAI/back/dataloaders/classes/dataloader.py +++ b/DashAI/back/dataloaders/classes/dataloader.py @@ -16,7 +16,16 @@ class BaseDataLoader(ConfigObject): - """Abstract class with base methods for DashAI dataloaders.""" + """Abstract class with base methods for DashAI dataloaders. + + Subclasses that handle self-describing formats (formats whose files carry + explicit column-type metadata, e.g. ARFF, Parquet, Feather) may set + ``SUPPORTS_NATIVE_TYPES = True`` and implement ``extract_native_types`` + to expose those native types directly, bypassing statistical type + inference. Subclasses may also declare a ``NATIVE_TYPE_MAPPING`` class + attribute as a convenient lookup from format-specific type names to + DashAI type dicts (the same shape produced by ``DashAIPtype.infer_types``). + """ TYPE: Final[str] = "DataLoader" CATEGORY: Final = MultilingualString( @@ -25,14 +34,45 @@ class BaseDataLoader(ConfigObject): pt="Carregamento de Arquivos", ) SUPPORTED_EXTENSIONS: frozenset[str] = frozenset() + SUPPORTS_NATIVE_TYPES: bool = False + NATIVE_TYPE_MAPPING: Dict[str, Dict[str, Any]] = {} @classmethod def get_metadata(cls) -> Dict[str, Any]: return { "category": cls.CATEGORY if cls.CATEGORY else "File Uploading", "supported_extensions": sorted(cls.SUPPORTED_EXTENSIONS), + "supports_native_types": cls.SUPPORTS_NATIVE_TYPES, } + def extract_native_types( + self, + filepath_or_buffer: str, + params: Dict[str, Any], + ) -> Dict[str, Dict[str, Any]] | None: + """Extract column types from the file's own metadata, if available. + + Default implementation returns ``None``, meaning the format does not + carry native type info. Subclasses for self-describing formats + override this and return one entry per column with the same dict + shape produced by ``DashAIPtype.infer_types`` (keys: ``type``, + ``dtype``, plus ``categories``/``encoder`` for ``Categorical``). + + Parameters + ---------- + filepath_or_buffer : str + Path to the file already prepared on disk (post-ZIP extraction). + params : Dict[str, Any] + Dataloader parameters, same dict that ``load_preview`` receives. + + Returns + ------- + Dict[str, Dict[str, Any]] | None + Column name -> DashAI type dict, or ``None`` if the format does + not provide native types. + """ + return None + @abstractmethod def load_data( self, diff --git a/DashAI/front/src/components/notebooks/datasetCreation/DataloaderConfigBar.jsx b/DashAI/front/src/components/notebooks/datasetCreation/DataloaderConfigBar.jsx index 531717d6a..b406f1a02 100644 --- a/DashAI/front/src/components/notebooks/datasetCreation/DataloaderConfigBar.jsx +++ b/DashAI/front/src/components/notebooks/datasetCreation/DataloaderConfigBar.jsx @@ -1,6 +1,6 @@ -import { Box, TextField, Typography } from "@mui/material"; +import { Box, Switch, TextField, Typography } from "@mui/material"; import PropTypes from "prop-types"; -import { useCallback, useRef, useState } from "react"; +import { useCallback, useEffect, useRef, useState } from "react"; import { useTheme } from "@mui/material/styles"; import FormSchema from "../../shared/FormSchema"; import FormSchemaContainer from "../../shared/FormSchemaContainer"; @@ -10,6 +10,7 @@ import { generateSequentialName } from "../../../utils/nameGenerator"; import FormSchemaFieldCard from "../../shared/FormSchemaFieldCard"; import { useTranslation } from "react-i18next"; import SideBar from "../../threeSectionLayout/panelContainers/SideBar"; +import { getComponents as getComponentsRequest } from "../../../api/component"; /** * Right sidebar component for configuring dataloader parameters @@ -28,19 +29,61 @@ export default function DataloaderConfigBar({ onValuesChange, }) { const [inferenceRows, setInferenceRows] = useState(1000); + const [supportsNativeTypes, setSupportsNativeTypes] = useState(false); + const [useNativeTypes, setUseNativeTypes] = useState(false); const schemaValuesRef = useRef({}); const { t } = useTranslation(["common", "datasets"]); const theme = useTheme(); const showInferenceRows = selectedDataloader !== "ImageDataLoader"; - // Handler for when FormSchema values change - merge with inference_rows + useEffect(() => { + let cancelled = false; + setUseNativeTypes(false); + if (!selectedDataloader) { + setSupportsNativeTypes(false); + return () => { + cancelled = true; + }; + } + getComponentsRequest({ model: selectedDataloader }) + .then((components) => { + if (cancelled) return; + const component = Array.isArray(components) + ? components[0] + : components; + const flag = !!component?.metadata?.supports_native_types; + setSupportsNativeTypes(flag); + if (flag) { + setUseNativeTypes(true); + if (onValuesChange) { + onValuesChange({ + ...schemaValuesRef.current, + inference_rows: inferenceRows, + use_native_types: true, + }); + } + } + }) + .catch(() => { + if (!cancelled) setSupportsNativeTypes(false); + }); + return () => { + cancelled = true; + }; + }, [selectedDataloader]); + + // Handler for when FormSchema values change - merge with inference_rows + native flag const handleFormSchemaValuesChange = useCallback(() => { const values = formSubmitRef?.current?.values || {}; schemaValuesRef.current = values; if (onValuesChange) { - onValuesChange({ ...values, inference_rows: inferenceRows }); + onValuesChange({ + ...values, + inference_rows: inferenceRows, + use_native_types: useNativeTypes, + }); } - }, [formSubmitRef, inferenceRows, onValuesChange]); + }, [formSubmitRef, inferenceRows, useNativeTypes, onValuesChange]); // Handler for when inference_rows changes - merge with schema values const handleInferenceRowsChange = useCallback( @@ -48,10 +91,29 @@ export default function DataloaderConfigBar({ const numeric = val ? Math.max(2, Number(val)) : 2; setInferenceRows(numeric); if (onValuesChange) { - onValuesChange({ ...schemaValuesRef.current, inference_rows: numeric }); + onValuesChange({ + ...schemaValuesRef.current, + inference_rows: numeric, + use_native_types: useNativeTypes, + }); + } + }, + [onValuesChange, useNativeTypes], + ); + + const handleUseNativeTypesChange = useCallback( + (event) => { + const next = event.target.checked; + setUseNativeTypes(next); + if (onValuesChange) { + onValuesChange({ + ...schemaValuesRef.current, + inference_rows: inferenceRows, + use_native_types: next, + }); } }, - [onValuesChange], + [onValuesChange, inferenceRows], ); if (!selectedDataloader) { @@ -101,9 +163,34 @@ export default function DataloaderConfigBar({ pb: 2, }} > + {supportsNativeTypes && ( + + + + + + + + )}