Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 24 additions & 5 deletions DashAI/back/api/api_v1/endpoints/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -1813,6 +1813,7 @@ async def preview_with_types(

try:
inference_rows = parsed_params.get("inference_rows", 1000)
use_native_types = parsed_params.get("use_native_types", False)
dataloader_name = parsed_params.get("dataloader_name")

if not dataloader_name:
Expand All @@ -1830,6 +1831,11 @@ async def preview_with_types(
dataloader_cls = component_registry[dataloader_name]["class"]
dataloader = dataloader_cls()

native_types = None
should_use_native = (
use_native_types and dataloader_cls.SUPPORTS_NATIVE_TYPES
)

if file.filename.endswith(".zip"):
allowed_exts = dataloader_cls.SUPPORTED_EXTENSIONS
extract_dir = tempfile.mkdtemp()
Expand Down Expand Up @@ -1915,6 +1921,11 @@ async def preview_with_types(
n_rows=inference_rows,
)

if should_use_native:
native_types = dataloader.extract_native_types(
matched_file, parsed_params
)

finally:
with contextlib.suppress(Exception):
shutil.rmtree(extract_dir, ignore_errors=True)
Expand All @@ -1926,17 +1937,25 @@ async def preview_with_types(
n_rows=inference_rows,
)

if should_use_native:
native_types = dataloader.extract_native_types(
tmp_file_path, parsed_params
)

sample_df = loaded_dataset.head(100)

table = pa.Table.from_pandas(loaded_dataset)
arrow_schema = arrow_to_dashai_schema(table)

methods = parsed_params.get("methods", ["DashAIPtype"])
inferred_types = {}
if native_types is not None:
inferred_types = native_types
else:
methods = parsed_params.get("methods", ["DashAIPtype"])
inferred_types = {}

for method in methods:
method_types = infer_types(loaded_dataset, method=method)
inferred_types.update(method_types)
for method in methods:
method_types = infer_types(loaded_dataset, method=method)
inferred_types.update(method_types)

sample_df = sample_df.replace({np.nan: None, np.inf: None, -np.inf: None})
sample = sample_df.to_dict(orient="records")
Expand Down
97 changes: 90 additions & 7 deletions DashAI/back/dataloaders/classes/arff_dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,19 @@ class ARFFDataLoader(BaseDataLoader):
SUPPORTED_EXTENSIONS: frozenset[str] = frozenset({".arff", ".zip"})
COMPATIBLE_COMPONENTS = ["TabularClassificationTask"]
SCHEMA = ARFFDataloaderSchema
SUPPORTS_NATIVE_TYPES: bool = True
NATIVE_TYPE_MAPPING: Dict[str, Dict[str, Any]] = {
"numeric": {"type": "Float", "dtype": "float64"},
"real": {"type": "Float", "dtype": "float64"},
"integer": {"type": "Integer", "dtype": "int64"},
"nominal": {
"type": "Categorical",
"dtype": "string",
"encoder": "one_hot",
},
"string": {"type": "Text", "dtype": "string", "encoding": "utf-8"},
"date": {"type": "Text", "dtype": "string", "encoding": "utf-8"},
}

DESCRIPTION: str = MultilingualString(
en=(
Expand All @@ -56,6 +69,25 @@ class ARFFDataLoader(BaseDataLoader):
pt="Carregador de Dados ARFF",
)

def _load_arff_raw(self, filepath: str):
    """Read an ARFF file with scipy and return its raw ``(data, meta)`` pair.

    Centralises the ``scipy.io.arff.loadarff`` call so that code needing
    the header metadata and code needing only the records share a single
    parsing path and a single error contract.

    Parameters
    ----------
    filepath : str
        Path to the ARFF file on disk.

    Returns
    -------
    tuple
        The ``(data, meta)`` tuple exactly as returned by
        ``scipy.io.arff.loadarff``.

    Raises
    ------
    datasets.builder.DatasetGenerationError
        If scipy raises any exception while loading the file. The original
        exception is chained as ``__cause__`` and its text is included in
        the message.
    """
    from datasets.builder import DatasetGenerationError
    from scipy.io import arff

    try:
        return arff.loadarff(filepath)
    except Exception as e:
        # Carry the parser's reason in the message; a bare
        # ``raise DatasetGenerationError from e`` stringifies to "".
        raise DatasetGenerationError(f"Failed to parse ARFF file: {e}") from e

def _read_arff_file(self, filepath: str):
"""Read an ARFF file and return a pandas DataFrame.

Expand All @@ -75,20 +107,71 @@ def _read_arff_file(self, filepath: str):
If the file cannot be parsed as valid ARFF.
"""
import pandas as pd
from datasets.builder import DatasetGenerationError
from scipy.io import arff

try:
data, _ = arff.loadarff(filepath)
except Exception as e:
raise DatasetGenerationError from e

data, _ = self._load_arff_raw(filepath)
arff_df = pd.DataFrame(data)
for col in arff_df.columns:
if arff_df[col].dtype == object:
arff_df[col] = arff_df[col].str.decode("utf-8")
return arff_df

@staticmethod
def _decode_if_bytes(value: Any) -> Any:
"""Return ``value`` UTF-8 decoded if it is bytes, otherwise unchanged."""
return value.decode("utf-8") if isinstance(value, bytes) else value

def extract_native_types(
    self,
    filepath_or_buffer: str,
    params: Dict[str, Any],
) -> Dict[str, Dict[str, Any]]:
    """Build the DashAI column-type map from the ARFF header itself.

    Reads the metadata object returned by ``_load_arff_raw`` and, for each
    attribute name, looks up the declared kind in ``NATIVE_TYPE_MAPPING``.
    For ``nominal`` attributes the category list is taken from the values
    reported by the metadata (e.g. ``@attribute color {red, green, blue}``)
    rather than guessed from the data.

    Kinds that are not present in ``NATIVE_TYPE_MAPPING`` fall back to a
    Text entry with the same keys as the mapping's ``string`` entry, so
    every Text column carries an ``encoding`` key.

    Parameters
    ----------
    filepath_or_buffer : str
        Path to a single ARFF file already on disk.
    params : Dict[str, Any]
        Unused by this implementation.

    Returns
    -------
    Dict[str, Dict[str, Any]]
        Column name -> type dict. Each dict is a copy of the matching
        ``NATIVE_TYPE_MAPPING`` entry plus an ``inference_reason`` dict
        and, for nominal columns with values, a ``categories`` list.

    Raises
    ------
    datasets.builder.DatasetGenerationError
        Propagated from ``_load_arff_raw`` when the file cannot be parsed.
    """
    _, meta = self._load_arff_raw(filepath_or_buffer)

    # Shape used for kinds we do not know about. Reusing the "string"
    # entry keeps the fallback consistent with the other Text entries.
    text_fallback = self.NATIVE_TYPE_MAPPING.get(
        "string",
        {"type": "Text", "dtype": "string", "encoding": "utf-8"},
    )

    native_types: Dict[str, Dict[str, Any]] = {}
    for col_name in meta.names():
        kind, values = meta[col_name]
        # A non-string kind is treated as a plain string column.
        kind_key = kind.lower() if isinstance(kind, str) else "string"

        # Copy so per-column additions never mutate the class-level mapping.
        info = dict(self.NATIVE_TYPE_MAPPING.get(kind_key, text_fallback))

        is_nominal = kind_key == "nominal"
        if is_nominal and values is not None:
            info["categories"] = [self._decode_if_bytes(v) for v in values]

        info["inference_reason"] = {
            "source": "arff_metadata",
            "native_type": kind_key,
            "final_type": info.get("type"),
            "is_categorical": is_nominal,
        }

        native_types[col_name] = info

    return native_types

def load_data(
self,
filepath_or_buffer: str,
Expand Down
42 changes: 41 additions & 1 deletion DashAI/back/dataloaders/classes/dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,16 @@


class BaseDataLoader(ConfigObject):
"""Abstract class with base methods for DashAI dataloaders."""
"""Abstract class with base methods for DashAI dataloaders.

Subclasses that handle self-describing formats (formats whose files carry
explicit column-type metadata, e.g. ARFF, Parquet, Feather) may set
``SUPPORTS_NATIVE_TYPES = True`` and implement ``extract_native_types``
to expose those native types directly, bypassing statistical type
inference. Subclasses may also declare a ``NATIVE_TYPE_MAPPING`` class
attribute as a convenient lookup from format-specific type names to
DashAI type dicts (the same shape produced by ``DashAIPtype.infer_types``).
"""

TYPE: Final[str] = "DataLoader"
CATEGORY: Final = MultilingualString(
Expand All @@ -25,14 +34,45 @@ class BaseDataLoader(ConfigObject):
pt="Carregamento de Arquivos",
)
SUPPORTED_EXTENSIONS: frozenset[str] = frozenset()
SUPPORTS_NATIVE_TYPES: bool = False
NATIVE_TYPE_MAPPING: Dict[str, Dict[str, Any]] = {}

@classmethod
def get_metadata(cls) -> Dict[str, Any]:
    """Return a summary dict describing this dataloader class.

    Returns
    -------
    Dict[str, Any]
        ``category`` (the class ``CATEGORY``, or ``"File Uploading"`` when
        it is falsy), ``supported_extensions`` (the class extensions as a
        sorted list) and ``supports_native_types`` (the class flag).
    """
    metadata: Dict[str, Any] = {}
    metadata["category"] = cls.CATEGORY or "File Uploading"
    metadata["supported_extensions"] = sorted(cls.SUPPORTED_EXTENSIONS)
    metadata["supports_native_types"] = cls.SUPPORTS_NATIVE_TYPES
    return metadata

def extract_native_types(
self,
filepath_or_buffer: str,
params: Dict[str, Any],
) -> Dict[str, Dict[str, Any]] | None:
"""Extract column types from the file's own metadata, if available.

Default implementation returns ``None``, meaning the format does not
carry native type info. Subclasses for self-describing formats
override this and return one entry per column with the same dict
shape produced by ``DashAIPtype.infer_types`` (keys: ``type``,
``dtype``, plus ``categories``/``encoder`` for ``Categorical``).

Parameters
----------
filepath_or_buffer : str
Path to the file already prepared on disk (post-ZIP extraction).
params : Dict[str, Any]
Dataloader parameters, same dict that ``load_preview`` receives.

Returns
-------
Dict[str, Dict[str, Any]] | None
Column name -> DashAI type dict, or ``None`` if the format does
not provide native types.
"""
return None

@abstractmethod
def load_data(
self,
Expand Down
Loading
Loading