diff --git a/DashAI/back/api/api_v1/endpoints/datasets.py b/DashAI/back/api/api_v1/endpoints/datasets.py
index 0b56e83c6..9a646a5d1 100644
--- a/DashAI/back/api/api_v1/endpoints/datasets.py
+++ b/DashAI/back/api/api_v1/endpoints/datasets.py
@@ -1813,6 +1813,7 @@ async def preview_with_types(
try:
inference_rows = parsed_params.get("inference_rows", 1000)
+ use_native_types = parsed_params.get("use_native_types", False)
dataloader_name = parsed_params.get("dataloader_name")
if not dataloader_name:
@@ -1830,6 +1831,11 @@ async def preview_with_types(
dataloader_cls = component_registry[dataloader_name]["class"]
dataloader = dataloader_cls()
+ native_types = None
+ should_use_native = (
+ use_native_types and dataloader_cls.SUPPORTS_NATIVE_TYPES
+ )
+
if file.filename.endswith(".zip"):
allowed_exts = dataloader_cls.SUPPORTED_EXTENSIONS
extract_dir = tempfile.mkdtemp()
@@ -1915,6 +1921,11 @@ async def preview_with_types(
n_rows=inference_rows,
)
+ if should_use_native:
+ native_types = dataloader.extract_native_types(
+ matched_file, parsed_params
+ )
+
finally:
with contextlib.suppress(Exception):
shutil.rmtree(extract_dir, ignore_errors=True)
@@ -1926,17 +1937,25 @@ async def preview_with_types(
n_rows=inference_rows,
)
+ if should_use_native:
+ native_types = dataloader.extract_native_types(
+ tmp_file_path, parsed_params
+ )
+
sample_df = loaded_dataset.head(100)
table = pa.Table.from_pandas(loaded_dataset)
arrow_schema = arrow_to_dashai_schema(table)
- methods = parsed_params.get("methods", ["DashAIPtype"])
- inferred_types = {}
+ if native_types is not None:
+ inferred_types = native_types
+ else:
+ methods = parsed_params.get("methods", ["DashAIPtype"])
+ inferred_types = {}
- for method in methods:
- method_types = infer_types(loaded_dataset, method=method)
- inferred_types.update(method_types)
+ for method in methods:
+ method_types = infer_types(loaded_dataset, method=method)
+ inferred_types.update(method_types)
sample_df = sample_df.replace({np.nan: None, np.inf: None, -np.inf: None})
sample = sample_df.to_dict(orient="records")
diff --git a/DashAI/back/dataloaders/classes/arff_dataloader.py b/DashAI/back/dataloaders/classes/arff_dataloader.py
index 11a3abf7a..7f6ee93ef 100644
--- a/DashAI/back/dataloaders/classes/arff_dataloader.py
+++ b/DashAI/back/dataloaders/classes/arff_dataloader.py
@@ -30,6 +30,19 @@ class ARFFDataLoader(BaseDataLoader):
SUPPORTED_EXTENSIONS: frozenset[str] = frozenset({".arff", ".zip"})
COMPATIBLE_COMPONENTS = ["TabularClassificationTask"]
SCHEMA = ARFFDataloaderSchema
+    # ARFF headers declare every attribute's kind, so this loader opts in to
+    # native typing.
+    SUPPORTS_NATIVE_TYPES: bool = True
+    # ARFF attribute kind -> DashAI type dict. ``extract_native_types`` copies
+    # the matching entry for each column before adding per-column keys.
+    NATIVE_TYPE_MAPPING: Dict[str, Dict[str, Any]] = {
+        "numeric": {"type": "Float", "dtype": "float64"},
+        "real": {"type": "Float", "dtype": "float64"},
+        "integer": {"type": "Integer", "dtype": "int64"},
+        "nominal": {"type": "Categorical", "dtype": "string", "encoder": "one_hot"},
+        "string": {"type": "Text", "dtype": "string", "encoding": "utf-8"},
+        # Dates are exposed as plain text columns.
+        "date": {"type": "Text", "dtype": "string", "encoding": "utf-8"},
+    }
DESCRIPTION: str = MultilingualString(
en=(
@@ -56,6 +69,25 @@ class ARFFDataLoader(BaseDataLoader):
pt="Carregador de Dados ARFF",
)
+    def _load_arff_raw(self, filepath: str):
+        """Parse an ARFF file with scipy and return its ``(data, meta)`` pair.
+
+        Single entry point for ``scipy.io.arff.loadarff`` so that callers
+        needing the records and callers needing the header metadata go
+        through the same parser and the same error translation.
+
+        Parameters
+        ----------
+        filepath : str
+            Path of the ARFF file to parse.
+
+        Returns
+        -------
+        tuple
+            The ``(data, meta)`` pair returned by ``arff.loadarff``.
+
+        Raises
+        ------
+        datasets.builder.DatasetGenerationError
+            If the file cannot be parsed; the underlying error is chained and
+            the message names the offending path.
+        """
+        from datasets.builder import DatasetGenerationError
+        from scipy.io import arff
+
+        try:
+            return arff.loadarff(filepath)
+        except Exception as exc:
+            # Broad on purpose: every parser failure is reported to callers as
+            # one error type. The message carries the path so the failure is
+            # diagnosable without walking the exception chain.
+            raise DatasetGenerationError(
+                f"Could not parse ARFF file: {filepath}"
+            ) from exc
+
def _read_arff_file(self, filepath: str):
"""Read an ARFF file and return a pandas DataFrame.
@@ -75,20 +107,71 @@ def _read_arff_file(self, filepath: str):
If the file cannot be parsed as valid ARFF.
"""
import pandas as pd
- from datasets.builder import DatasetGenerationError
- from scipy.io import arff
-
- try:
- data, _ = arff.loadarff(filepath)
- except Exception as e:
- raise DatasetGenerationError from e
+ data, _ = self._load_arff_raw(filepath)
arff_df = pd.DataFrame(data)
for col in arff_df.columns:
if arff_df[col].dtype == object:
arff_df[col] = arff_df[col].str.decode("utf-8")
return arff_df
+    @staticmethod
+    def _decode_if_bytes(value: Any) -> Any:
+        """Decode ``bytes`` as UTF-8; hand any other value back untouched."""
+        if not isinstance(value, bytes):
+            return value
+        return value.decode("utf-8")
+
+    def extract_native_types(
+        self,
+        filepath_or_buffer: str,
+        params: Dict[str, Any],
+    ) -> Dict[str, Dict[str, Any]]:
+        """Build the DashAI column-type map from the ARFF header itself.
+
+        Each attribute kind reported by the scipy metadata object is looked
+        up in ``NATIVE_TYPE_MAPPING``; kinds missing from the mapping fall
+        back to UTF-8 ``Text``. For ``nominal`` attributes the category list
+        is taken from the header declaration (e.g.
+        ``@attribute color {red, green, blue}``) rather than guessed from the
+        data.
+
+        NOTE(review): scipy appears to report ``integer`` and ``real``
+        attributes as ``numeric``, in which case those columns come back as
+        ``Float`` -- confirm against the installed scipy version.
+
+        Parameters
+        ----------
+        filepath_or_buffer : str
+            Path to a single ARFF file already on disk.
+        params : Dict[str, Any]
+            Unused (ARFF needs no parameters).
+
+        Returns
+        -------
+        Dict[str, Dict[str, Any]]
+            Column name -> DashAI type dict, each carrying an
+            ``inference_reason`` entry that records the ARFF kind it came
+            from.
+
+        Raises
+        ------
+        datasets.builder.DatasetGenerationError
+            Propagated from ``_load_arff_raw`` when the file is not valid
+            ARFF.
+        """
+        _, meta = self._load_arff_raw(filepath_or_buffer)
+
+        # Same shape as the "string" mapping entry, so every Text column
+        # carries an encoding whether or not its kind is in the mapping.
+        fallback = {"type": "Text", "dtype": "string", "encoding": "utf-8"}
+
+        native_types: Dict[str, Dict[str, Any]] = {}
+        for col_name in meta.names():
+            kind, values = meta[col_name]
+            kind_key = kind.lower() if isinstance(kind, str) else "string"
+
+            # Copy, so per-column keys never leak into the shared class-level
+            # mapping (or into the fallback reused across iterations).
+            info = dict(self.NATIVE_TYPE_MAPPING.get(kind_key, fallback))
+
+            if kind_key == "nominal" and values is not None:
+                info["categories"] = [self._decode_if_bytes(v) for v in values]
+
+            info["inference_reason"] = {
+                "source": "arff_metadata",
+                "native_type": kind_key,
+                "final_type": info.get("type"),
+                "is_categorical": kind_key == "nominal",
+            }
+
+            native_types[col_name] = info
+
+        return native_types
+
def load_data(
self,
filepath_or_buffer: str,
diff --git a/DashAI/back/dataloaders/classes/dataloader.py b/DashAI/back/dataloaders/classes/dataloader.py
index c04c1f0b1..b8c912b8c 100644
--- a/DashAI/back/dataloaders/classes/dataloader.py
+++ b/DashAI/back/dataloaders/classes/dataloader.py
@@ -16,7 +16,16 @@
class BaseDataLoader(ConfigObject):
- """Abstract class with base methods for DashAI dataloaders."""
+ """Abstract class with base methods for DashAI dataloaders.
+
+ Subclasses that handle self-describing formats (formats whose files carry
+ explicit column-type metadata, e.g. ARFF, Parquet, Feather) may set
+ ``SUPPORTS_NATIVE_TYPES = True`` and implement ``extract_native_types``
+ to expose those native types directly, bypassing statistical type
+ inference. Subclasses may also declare a ``NATIVE_TYPE_MAPPING`` class
+ attribute as a convenient lookup from format-specific type names to
+ DashAI type dicts (the same shape produced by ``DashAIPtype.infer_types``).
+ """
TYPE: Final[str] = "DataLoader"
CATEGORY: Final = MultilingualString(
@@ -25,14 +34,45 @@ class BaseDataLoader(ConfigObject):
pt="Carregamento de Arquivos",
)
SUPPORTED_EXTENSIONS: frozenset[str] = frozenset()
+ SUPPORTS_NATIVE_TYPES: bool = False
+ NATIVE_TYPE_MAPPING: Dict[str, Dict[str, Any]] = {}
@classmethod
def get_metadata(cls) -> Dict[str, Any]:
return {
"category": cls.CATEGORY if cls.CATEGORY else "File Uploading",
"supported_extensions": sorted(cls.SUPPORTED_EXTENSIONS),
+ "supports_native_types": cls.SUPPORTS_NATIVE_TYPES,
}
+    def extract_native_types(
+        self,
+        filepath_or_buffer: str,
+        params: Dict[str, Any],
+    ) -> Dict[str, Dict[str, Any]] | None:
+        """Return column types read from the file's own metadata, if any.
+
+        This base implementation ignores both arguments and always answers
+        ``None``, which signals that the format carries no native type
+        information. Loaders for self-describing formats override it and
+        return one entry per column, using the dict shape produced by
+        ``DashAIPtype.infer_types`` (keys ``type`` and ``dtype``, plus
+        ``categories``/``encoder`` for ``Categorical`` columns).
+
+        Parameters
+        ----------
+        filepath_or_buffer : str
+            Path to the file already prepared on disk (post-ZIP extraction).
+        params : Dict[str, Any]
+            Dataloader parameters, the same dict ``load_preview`` receives.
+
+        Returns
+        -------
+        Dict[str, Dict[str, Any]] | None
+            Column name -> DashAI type dict, or ``None`` when the format
+            provides no native types.
+        """
+        return None
+
@abstractmethod
def load_data(
self,
diff --git a/DashAI/front/src/components/notebooks/datasetCreation/DataloaderConfigBar.jsx b/DashAI/front/src/components/notebooks/datasetCreation/DataloaderConfigBar.jsx
index 531717d6a..b406f1a02 100644
--- a/DashAI/front/src/components/notebooks/datasetCreation/DataloaderConfigBar.jsx
+++ b/DashAI/front/src/components/notebooks/datasetCreation/DataloaderConfigBar.jsx
@@ -1,6 +1,6 @@
-import { Box, TextField, Typography } from "@mui/material";
+import { Box, Switch, TextField, Typography } from "@mui/material";
import PropTypes from "prop-types";
-import { useCallback, useRef, useState } from "react";
+import { useCallback, useEffect, useRef, useState } from "react";
import { useTheme } from "@mui/material/styles";
import FormSchema from "../../shared/FormSchema";
import FormSchemaContainer from "../../shared/FormSchemaContainer";
@@ -10,6 +10,7 @@ import { generateSequentialName } from "../../../utils/nameGenerator";
import FormSchemaFieldCard from "../../shared/FormSchemaFieldCard";
import { useTranslation } from "react-i18next";
import SideBar from "../../threeSectionLayout/panelContainers/SideBar";
+import { getComponents as getComponentsRequest } from "../../../api/component";
/**
* Right sidebar component for configuring dataloader parameters
@@ -28,19 +29,61 @@ export default function DataloaderConfigBar({
onValuesChange,
}) {
const [inferenceRows, setInferenceRows] = useState(1000);
+ const [supportsNativeTypes, setSupportsNativeTypes] = useState(false);
+ const [useNativeTypes, setUseNativeTypes] = useState(false);
const schemaValuesRef = useRef({});
const { t } = useTranslation(["common", "datasets"]);
const theme = useTheme();
const showInferenceRows = selectedDataloader !== "ImageDataLoader";
- // Handler for when FormSchema values change - merge with inference_rows
+ useEffect(() => {
+ let cancelled = false;
+ setUseNativeTypes(false);
+ if (!selectedDataloader) {
+ setSupportsNativeTypes(false);
+ return () => {
+ cancelled = true;
+ };
+ }
+ getComponentsRequest({ model: selectedDataloader })
+ .then((components) => {
+ if (cancelled) return;
+ const component = Array.isArray(components)
+ ? components[0]
+ : components;
+ const flag = !!component?.metadata?.supports_native_types;
+ setSupportsNativeTypes(flag);
+ if (flag) {
+ setUseNativeTypes(true);
+ if (onValuesChange) {
+ onValuesChange({
+ ...schemaValuesRef.current,
+ inference_rows: inferenceRows,
+ use_native_types: true,
+ });
+ }
+ }
+ })
+ .catch(() => {
+ if (!cancelled) setSupportsNativeTypes(false);
+ });
+ return () => {
+ cancelled = true;
+ };
+ }, [selectedDataloader]);
+
+ // Handler for when FormSchema values change - merge with inference_rows + native flag
const handleFormSchemaValuesChange = useCallback(() => {
const values = formSubmitRef?.current?.values || {};
schemaValuesRef.current = values;
if (onValuesChange) {
- onValuesChange({ ...values, inference_rows: inferenceRows });
+ onValuesChange({
+ ...values,
+ inference_rows: inferenceRows,
+ use_native_types: useNativeTypes,
+ });
}
- }, [formSubmitRef, inferenceRows, onValuesChange]);
+ }, [formSubmitRef, inferenceRows, useNativeTypes, onValuesChange]);
// Handler for when inference_rows changes - merge with schema values
const handleInferenceRowsChange = useCallback(
@@ -48,10 +91,29 @@ export default function DataloaderConfigBar({
const numeric = val ? Math.max(2, Number(val)) : 2;
setInferenceRows(numeric);
if (onValuesChange) {
- onValuesChange({ ...schemaValuesRef.current, inference_rows: numeric });
+ onValuesChange({
+ ...schemaValuesRef.current,
+ inference_rows: numeric,
+ use_native_types: useNativeTypes,
+ });
+ }
+ },
+ [onValuesChange, useNativeTypes],
+ );
+
+ const handleUseNativeTypesChange = useCallback(
+ (event) => {
+ const next = event.target.checked;
+ setUseNativeTypes(next);
+ if (onValuesChange) {
+ onValuesChange({
+ ...schemaValuesRef.current,
+ inference_rows: inferenceRows,
+ use_native_types: next,
+ });
}
},
- [onValuesChange],
+ [onValuesChange, inferenceRows],
);
if (!selectedDataloader) {
@@ -101,9 +163,34 @@ export default function DataloaderConfigBar({
pb: 2,
}}
>
+ {supportsNativeTypes && (
+
+
+
+
+
+
+
+ )}