From 6618d371492bebec9afcee5a249c44ba1e951ba3 Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Thu, 28 May 2026 05:47:03 +0000
Subject: [PATCH 1/5] feat: add MXFP4/MXFP8 quantization support and related
 tests

Signed-off-by: Xin He <xin3.he@intel.com>
---
 auto_round/compressors/model_free.py          | 206 +++++++++++++++++-
 test/test_cpu/quantization/test_model_free.py |  97 ++++++++-
 2 files changed, 293 insertions(+), 10 deletions(-)

diff --git a/auto_round/compressors/model_free.py b/auto_round/compressors/model_free.py
index bd9df7244..54e773c4c 100644
--- a/auto_round/compressors/model_free.py
+++ b/auto_round/compressors/model_free.py
@@ -74,6 +74,7 @@
 import torch
 
 from auto_round import envs
+from auto_round.compressors.utils import is_mx_fp
 from auto_round.logger import logger
 from auto_round.schemes import PRESET_SCHEMES, QuantizationScheme, preset_name_to_scheme
 from auto_round.utils.common import AUDIO_MM_KEYS, VISION_MM_KEYS, compress_layer_names, to_standard_regex
@@ -101,12 +102,17 @@
     "W4A16",
     "W4A16_MIXED",
     "W8A16",
+    "MXFP4",
+    "MXFP8",
 )
 
 # Allowed ``bits`` values for integer WOQ.
 # 3-bit is excluded — see note above.
 _SUPPORTED_INT_BITS: tuple[int, ...] = (2, 4, 8)
 
+# Allowed ``bits`` values for MXFP weight quantization.
+_SUPPORTED_MXFP_BITS: tuple[int, ...] = (4, 8)
+
 # Multimodal keywords kept in full precision by default.
 _NONTEXT_KEYWORDS: tuple[str, ...] = VISION_MM_KEYS + AUDIO_MM_KEYS
 
@@ -449,6 +455,77 @@ def _resolve_uncached(self, tensor_name: str) -> dict | None:
 # ---------------------------------------------------------------------------
 
 
+def _quantize_weight_mxfp(
+    weight: torch.Tensor,
+    layer_name: str,
+    bits: int,
+    group_size: int,
+    data_type: str,
+    device: str = "cpu",
+) -> dict[str, torch.Tensor]:
+    """Quantize a 2D weight tensor to MXFP4 / MXFP8 and return packed outputs.
+
+    Reuses :func:`auto_round.data_type.mxfp.quant_mx` to derive the per-block
+    shared exponent (E8M0 scale), and :class:`auto_round.export.export_to_autoround.qlinear_fp.QuantLinear`
+    to perform the same packing as :func:`auto_round.export.export_to_llmcompressor.export_to_fp.pack_layer`.
+
+    Returns a dict of CPU tensors (raises ``ValueError`` on a non-divisible ``in_features``) with one of:
+      * MXFP8: ``{layer_name+'.weight': float8_e4m3fn, layer_name+'.weight_scale': uint8}``
+      * MXFP4: ``{layer_name+'.weight_packed': uint8, layer_name+'.weight_scale': uint8}``
+    """
+    import torch.nn as nn
+
+    from auto_round.data_type.mxfp import quant_mx
+    from auto_round.export.export_to_autoround.qlinear_fp import QuantLinear
+
+    if not is_mx_fp(data_type):
+        data_type = "mx_fp"  # non-MX data types fall back to the generic MXFP type
+
+    out_features, in_features = weight.shape
+    if in_features % group_size != 0:
+        raise ValueError(
+            f"in_features={in_features} for layer '{layer_name}' is not divisible "
+            f"by MXFP group_size={group_size}; cannot pack."
+        )
+
+    weight_dev = weight.to(device)
+    # quant_mx returns (qdq_tensor, shared_exp, None).  We only need shared_exp
+    # (the per-block log2 scale).  The element-wise rounding to the FP4/FP8 grid
+    # is performed inside QuantLinear.pack via dtype casts / pack_fp4_to_uint8.
+    _, shared_exp, _ = quant_mx(weight_dev, bits=bits, group_size=group_size, data_type=data_type)
+    # Reshape to (out_features, n_groups) so the on-disk weight_scale matches
+    # the llm-compressor convention (and QuantLinear's registered buffer shape).
+    shared_exp = shared_exp.reshape(out_features, in_features // group_size)
+
+    # Wrap the weight in a throwaway bias-free nn.Linear: QuantLinear.pack is
+    # handed a module, so this lets us reuse the existing packing code as-is.
+    fake_linear = nn.Linear(in_features, out_features, bias=False)
+    with torch.no_grad():
+        fake_linear.weight = nn.Parameter(weight_dev, requires_grad=False)
+
+    qlayer = QuantLinear(
+        bits=bits,
+        group_size=group_size,
+        infeatures=in_features,
+        outfeatures=out_features,
+        bias=False,
+        data_type="mx_fp4" if bits == 4 else "mx_fp8e4m3",
+        sym=True,
+        act_bits=bits,
+    )
+    qlayer.pack(fake_linear, shared_exp, device=device)
+
+    if bits == 8:  # MXFP8 output is keyed '.weight'; any other width uses '.weight_packed'
+        return {
+            f"{layer_name}.weight": qlayer.weight.to("cpu"),
+            f"{layer_name}.weight_scale": qlayer.weight_scale.to("cpu"),
+        }
+    return {
+        f"{layer_name}.weight_packed": qlayer.weight_packed.to("cpu"),
+        f"{layer_name}.weight_scale": qlayer.weight_scale.to("cpu"),
+    }
+
+
 def _quantize_single_tensor(
     tensor_name: str,
     tensor: torch.Tensor,
@@ -481,10 +558,29 @@ def _quantize_single_tensor(
     bits = scheme["bits"]
     group_size = scheme["group_size"]
     sym = scheme.get("sym", True)
+    data_type = (scheme.get("data_type") or "int").lower()
 
     if bits >= 16:
         return layer_name, {tensor_name: tensor}, None, layer_name
 
+    # ---- MXFP path (MXFP4 / MXFP8) ----
+    if is_mx_fp(data_type):
+        try:
+            out = _quantize_weight_mxfp(
+                weight=tensor,
+                layer_name=layer_name,
+                bits=bits,
+                group_size=group_size,
+                data_type=data_type,
+                device=device,
+            )
+            logger.debug(f"Quantized (MXFP): {layer_name} (bits={bits}, group_size={group_size})")
+            return layer_name, out, layer_name, None
+        except Exception as e:
+            logger.warning(f"Failed to MXFP-quantize {layer_name}: {e}. Keeping original weight.")
+            return layer_name, {tensor_name: tensor}, None, layer_name
+
+    # ---- Integer WOQ path ----
     try:
         qweight, qzeros, scales = quantize_weight_rtn(
             weight=tensor,
@@ -686,7 +782,23 @@ def _process_shard(
             raw_tensors = {name: f.get_tensor(name) for name in f.keys()}
 
     raw_tensors = split_fused_expert_tensors(raw_tensors)
+
+    # Preserve original tensors for ignored/skipped layers so that already-
+    # quantized weights (FP8, FP4-packed, etc.) are NOT dequantized.
+    preserved_prefixes: set[str] = set()
+    for tname in raw_tensors:
+        if tname.endswith(".weight") and (matcher.should_ignore(tname) or matcher.should_skip(tname)):
+            preserved_prefixes.add(tname.rsplit(".", 1)[0])
+
+    preserved_tensors: dict[str, torch.Tensor] = {}
+    if preserved_prefixes:
+        for key in list(raw_tensors.keys()):
+            prefix = key.rsplit(".", 1)[0]
+            if prefix in preserved_prefixes:
+                preserved_tensors[key] = raw_tensors.pop(key)
+
     raw_tensors = _dequant_fp8_tensors(raw_tensors, block_size=fp8_block_size)
+    raw_tensors.update(preserved_tensors)
 
     for tensor_name in list(raw_tensors.keys()):
         tensor = raw_tensors.pop(tensor_name)
@@ -710,6 +822,46 @@ def _process_shard(
 # ---------------------------------------------------------------------------
 
 
+def _build_mxfp_quantization_config(
+    default_scheme: dict,
+    quantized_layers: list[str],
+    ignored_layers: list[str],
+) -> dict:
+    """Build a compressed-tensors / llm-compressor style quantization_config
+    dict for MXFP4 / MXFP8 model-free output.
+
+    Mirrors the per-group format of :mod:`auto_round.export.export_to_llmcompressor.export_to_fp`.
+    Raises ``ValueError`` unless ``default_scheme["bits"]`` is in ``_SUPPORTED_MXFP_BITS``.
+    """
+    from auto_round.export.export_to_llmcompressor.config import (
+        check_compressed_tensors_supported,
+        initialize_quantization,
+    )
+
+    check_compressed_tensors_supported(raise_error=True)
+
+    bits = default_scheme["bits"]
+    if bits not in _SUPPORTED_MXFP_BITS:
+        raise ValueError(f"Unsupported MXFP bits={bits} for model-free output.")
+
+    scheme_name = "MXFP4" if bits == 4 else "MXFP8"
+    fmt = "mxfp4-pack-quantized" if bits == 4 else "mxfp8-quantized"
+
+    # Ignore list: deduped ignored_layers minus anything that was quantized.
+    # 'lm_head' is added by default, but not when it was itself quantized.
+    ignore = list(dict.fromkeys(ignored_layers))
+    quant_set = set(quantized_layers)
+    ignore = [n for n in ignore if n not in quant_set]
+    if "lm_head" not in ignore and "lm_head" not in quant_set:
+        ignore.append("lm_head")
+
+    qconfig = initialize_quantization(scheme=scheme_name, ignore=ignore)
+    qconfig = qconfig.to_dict()
+    qconfig["format"] = fmt
+    qconfig["provider"] = "auto-round"
+    return qconfig
+
+
 def _build_quantization_config(
     default_scheme: dict,
     layer_config: dict,
@@ -719,6 +871,14 @@ def _build_quantization_config(
     block_name_to_quantize: Optional[list[str]] = None,
 ) -> dict:
     """Build a quantization_config dict compatible with auto-round format."""
+    # MXFP (mx_fp) uses the llm-compressor / compressed-tensors style config.
+    if is_mx_fp((default_scheme.get("data_type") or "int").lower()):
+        return _build_mxfp_quantization_config(
+            default_scheme=default_scheme,
+            quantized_layers=quantized_layers,
+            ignored_layers=ignored_layers,
+        )
+
     from auto_round.version import __version__
 
     scheme_keys = [f.name for f in fields(QuantizationScheme)]
@@ -883,7 +1043,28 @@ def _validate_supported_scheme(
     Model-free only supports integer weight-only quantization (sym/asym),
     packed in the ``auto_round:auto_gptq`` format.
     """
+    data_type = (scheme_obj.data_type or "int").lower()
+    bits = scheme_obj.bits
     act_bits = scheme_obj.act_bits if scheme_obj.act_bits is not None else 16
+
+    # MXFP weight-only path: accept mx_fp data type with bits in {4, 8}.
+    # Activation quantization for MXFP is dynamic at inference time, so the
+    # weight-only RTN path here is independent of act_bits.
+    if is_mx_fp(data_type):
+        if bits is None or bits not in _SUPPORTED_MXFP_BITS:
+            raise ValueError(
+                f"Model-free mode supports MXFP bits in {_SUPPORTED_MXFP_BITS}, "
+                f"but '{scheme_input}' requests bits={bits}. "
+                f"Supported preset schemes: {list(SUPPORTED_PRESET_SCHEMES)}."
+            )
+        group_size = scheme_obj.group_size
+        if group_size not in (None, 32):
+            raise ValueError(
+                f"Model-free mode supports MXFP only with group_size=32, "
+                f"but '{scheme_input}' requests group_size={group_size}."
+            )
+        return
+
     if act_bits < 16:
         raise ValueError(
             f"Model-free mode only supports weight-only quantization (WOQ) schemes "
@@ -891,17 +1072,15 @@ def _validate_supported_scheme(
             f"Supported preset schemes: {list(SUPPORTED_PRESET_SCHEMES)}."
         )
 
-    data_type = (scheme_obj.data_type or "int").lower()
     if data_type != "int":
         raise ValueError(
             f"Model-free mode only supports integer weight quantization "
-            f"(data_type='int'), but '{scheme_input}' has data_type='{data_type}'. "
-            f"FP8 / MXFP / NVFP / GGUF / BF16 schemes require the standard "
-            f"AutoRound flow.  Supported preset schemes: "
+            f"(data_type='int') or MXFP (data_type='mx_fp'), but '{scheme_input}' "
+            f"has data_type='{data_type}'. FP8 / NVFP / GGUF / BF16 schemes require "
+            f"the standard AutoRound flow.  Supported preset schemes: "
             f"{list(SUPPORTED_PRESET_SCHEMES)}."
         )
 
-    bits = scheme_obj.bits
     if bits is None or bits not in _SUPPORTED_INT_BITS:
         raise ValueError(
             f"Model-free mode supports bits in {_SUPPORTED_INT_BITS}, "
@@ -963,7 +1142,12 @@ class _ModelFreeCompressorCore:
             modules are kept in full precision.
     """
 
-    SUPPORTED_FORMATS: tuple[str, ...] = ("auto_round",)
+    SUPPORTED_FORMATS: tuple[str, ...] = (
+        "auto_round",
+        "auto_round:auto_gptq",
+        "llm_compressor",
+        "auto_round:llm_compressor",
+    )
 
     def __init__(
         self,
@@ -1610,7 +1794,15 @@ def quantize_and_save(
         **kwargs,
     ):
         """Quantize and save — AutoRound compressor entry point."""
-        if format not in ["auto_round", "auto_round:auto_gptq"]:
+        # Accept the standard auto_round formats plus llm_compressor variants
+        # (the latter are the natural output for MXFP4 / MXFP8 schemes).
+        _accepted_formats = {
+            "auto_round",
+            "auto_round:auto_gptq",
+            "llm_compressor",
+            "auto_round:llm_compressor",
+        }
+        if format not in _accepted_formats:
             return self._fallback_to_quantize_and_save(output_dir=output_dir, format=format, inplace=inplace, **kwargs)
 
         # Apply user scheme overrides before running
diff --git a/test/test_cpu/quantization/test_model_free.py b/test/test_cpu/quantization/test_model_free.py
index 736b6c910..dc0e6f59e 100644
--- a/test/test_cpu/quantization/test_model_free.py
+++ b/test/test_cpu/quantization/test_model_free.py
@@ -28,6 +28,7 @@
     _ModelFreeCompressorCore,
     _PatternMatcher,
     _process_shard,
+    _quantize_weight_mxfp,
     get_predefined_ignore_layers_from_config,
     is_model_free_supported_scheme,
 )
@@ -254,6 +255,25 @@ def test_process_shard_fp8(self, tmp_path):
         output, quantized, _ = _process_shard(shard_path, _DEFAULT_SCHEME, {}, [], device="cpu", fp8_block_size=None)
         assert "layer" in quantized and "layer.qweight" in output
 
+    def test_ignored_layer_preserves_original_fp8(self, tmp_path):
+        """Ignored layers keep their original quantized tensors (no dequant)."""
+        shard_path = str(tmp_path / "shard.safetensors")
+        w_fp8 = torch.randn(64, 128, dtype=torch.bfloat16).to(torch.float8_e4m3fn)
+        scale = torch.tensor(0.5)
+        save_file(
+            {"lm_head.weight": w_fp8, "lm_head.weight_scale_inv": scale, "layer.weight": torch.randn(64, 128)},
+            shard_path,
+        )
+        output, quantized, ignored = _process_shard(
+            shard_path, _DEFAULT_SCHEME, {}, ["lm_head"], device="cpu", fp8_block_size=None
+        )
+        # lm_head should be ignored and kept in original FP8 format
+        assert "lm_head" in ignored
+        assert output["lm_head.weight"].dtype == torch.float8_e4m3fn
+        assert "lm_head.weight_scale_inv" in output
+        # non-ignored layer should be quantized normally
+        assert "layer" in quantized
+
 
 # ===========================================================================
 #  End-to-end ModelFreeQuantize
@@ -305,13 +325,80 @@ def test_asym(self, tmp_path):
         assert qc["sym"] is False and qc["group_size"] == 64
 
 
+# ===========================================================================
+#  MXFP4 / MXFP8 model-free quantization
+# ===========================================================================
+
+
+class TestModelFreeMXFP:
+    """Unit and end-to-end tests for MXFP4/MXFP8 model-free quantization."""
+
+    def test_quantize_weight_mxfp4_shapes(self):
+        w = torch.randn(64, 128, dtype=torch.bfloat16)
+        out = _quantize_weight_mxfp(w, "layer", bits=4, group_size=32, data_type="mx_fp")
+        assert out["layer.weight_packed"].shape == (64, 64)  # in_features / 2
+        assert out["layer.weight_packed"].dtype == torch.uint8
+        assert out["layer.weight_scale"].shape == (64, 4)  # in_features / group_size
+        assert out["layer.weight_scale"].dtype == torch.uint8
+
+    def test_quantize_weight_mxfp8_shapes(self):
+        w = torch.randn(64, 128, dtype=torch.bfloat16)
+        out = _quantize_weight_mxfp(w, "layer", bits=8, group_size=32, data_type="mx_fp")
+        assert out["layer.weight"].shape == (64, 128)
+        assert out["layer.weight"].dtype == torch.float8_e4m3fn
+        assert out["layer.weight_scale"].shape == (64, 4)
+        assert out["layer.weight_scale"].dtype == torch.uint8
+
+    @pytest.mark.parametrize("scheme,fmt", [("MXFP4", "mxfp4-pack-quantized"), ("MXFP8", "mxfp8-quantized")])
+    def test_e2e_mxfp(self, tmp_path, scheme, fmt):
+        tensors = {
+            "model.layers.0.self_attn.q_proj.weight": torch.randn(128, 128),
+            "model.layers.0.fc1.weight": torch.randn(512, 128),
+            "lm_head.weight": torch.randn(1000, 128),
+        }
+        model_dir = _make_model_dir(tmp_path, _LLAMA_CFG, tensors)
+        output_dir = str(tmp_path / "output")
+        _ModelFreeCompressorCore(model_name_or_path=model_dir, output_dir=output_dir, scheme=scheme).run()
+        qc = _read_qconfig(output_dir)
+        assert qc["format"] == fmt
+        assert qc["quant_method"] == "compressed-tensors"
+        assert "lm_head" in qc["ignore"]
+        keys = _read_output_keys(output_dir)
+        # MXFP4 produces weight_packed, MXFP8 produces weight
+        if scheme == "MXFP4":
+            assert "model.layers.0.fc1.weight_packed" in keys
+        else:
+            assert "model.layers.0.fc1.weight" in keys
+        assert "model.layers.0.fc1.weight_scale" in keys
+        # lm_head stays full precision
+        assert "lm_head.weight" in keys
+        assert "lm_head.weight_packed" not in keys
+
+    def test_mxfp4_via_autoround_api(self, tmp_path):
+        tensors = {"model.layers.0.fc1.weight": torch.randn(128, 128)}
+        model_dir = _make_model_dir(tmp_path, _LLAMA_CFG, tensors)
+        output_dir = str(tmp_path / "output")
+        AutoRound(model=model_dir, scheme="MXFP4", model_free=True).quantize_and_save(output_dir)
+        qc = _read_qconfig(output_dir)
+        assert qc["format"] == "mxfp4-pack-quantized"
+
+    def test_process_shard_mxfp(self, tmp_path):
+        shard_path = str(tmp_path / "shard.safetensors")
+        save_file({"layer.fc1.weight": torch.randn(64, 128)}, shard_path)
+        scheme = {"bits": 4, "group_size": 32, "sym": True, "data_type": "mx_fp"}
+        output, quantized, _ = _process_shard(shard_path, scheme, {}, [])
+        assert "layer.fc1" in quantized
+        assert "layer.fc1.weight_packed" in output
+        assert "layer.fc1.weight_scale" in output
+
+
 # ===========================================================================
 #  Scheme validation
 # ===========================================================================
 
 
-_SUPPORTED = ["W2A16", "W2A16G32", "W2A16G64", "W4A16", "W4A16_MIXED", "W8A16"]
-_UNSUPPORTED = ["W3A16", "FPW8A16", "BF16", "MXFP4", "MXFP8", "MXINT4", "NVFP4", "FP8_BLOCK", "FP8_STATIC", "INT8_W8A8"]
+_SUPPORTED = ["W2A16", "W2A16G32", "W2A16G64", "W4A16", "W4A16_MIXED", "W8A16", "MXFP4", "MXFP8"]
+_UNSUPPORTED = ["W3A16", "FPW8A16", "BF16", "MXINT4", "NVFP4", "FP8_BLOCK", "FP8_STATIC", "INT8_W8A8"]
 
 
 class TestSchemeValidation:
@@ -320,7 +407,11 @@ def test_supported(self, tmp_path, name):
         model_dir = _make_model_dir(tmp_path, _LLAMA_CFG, {"model.layers.0.mlp.fc1.weight": torch.randn(64, 128)})
         out = str(tmp_path / f"out_{name}")
         AutoRound(model=model_dir, scheme=name, model_free=True).quantize_and_save(out)
-        assert "model.layers.0.mlp.fc1.qweight" in _read_output_keys(out)
+        keys = _read_output_keys(out)
+        if name.startswith("MXFP"):
+            assert "model.layers.0.mlp.fc1.weight_scale" in keys
+        else:
+            assert "model.layers.0.mlp.fc1.qweight" in keys
 
     @pytest.mark.parametrize("name", _UNSUPPORTED)
     def test_unsupported_raises(self, tmp_path, name):

From aedf04f58efbe35986e2352c318b31b7456b9767 Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Thu, 28 May 2026 06:27:10 +0000
Subject: [PATCH 2/5] update document

Signed-off-by: Xin He <xin3.he@intel.com>
---
 auto_round/compressors/model_free.py | 77 ++++++++++++++++++++++------
 docs/step_by_step.md                 | 29 +++++++++--
 docs/step_by_step_CN.md              | 29 +++++++++--
 3 files changed, 112 insertions(+), 23 deletions(-)

diff --git a/auto_round/compressors/model_free.py b/auto_round/compressors/model_free.py
index 54e773c4c..528cfdb1a 100644
--- a/auto_round/compressors/model_free.py
+++ b/auto_round/compressors/model_free.py
@@ -24,39 +24,67 @@
 
 Supported schemes
 -----------------
-Model-free mode currently supports **integer weight-only** quantization
-schemes packed in the ``auto_round:auto_gptq`` format only.  Specifically:
+Model-free mode supports the following quantization families:
 
-* Preset names: ``W2A16``, ``W2A16G32``, ``W2A16G64``, ``W3A16``, ``W4A16``,
-  ``W8A16``.
+**Integer weight-only** (packed in ``auto_round:auto_gptq`` format):
+
+* Preset names: ``W2A16``, ``W2A16G32``, ``W2A16G64``, ``W4A16``,
+  ``W4A16_MIXED``, ``W8A16``.
 * Custom :class:`~auto_round.schemes.QuantizationScheme` instances with
-  ``data_type="int"``, ``bits in {2, 3, 4, 8}``, ``act_bits >= 16``, and any
+  ``data_type="int"``, ``bits in {2, 4, 8}``, ``act_bits >= 16``, and any
   symmetric / asymmetric configuration.
 
-Schemes that require special packing (FP8, MXFP4, NVFP4, GGUF, INT8_W8A8,
+**MXFP (Microscaling Floating Point)** (packed in ``mxfp4-pack-quantized`` or
+``mxfp8-quantized`` format, compatible with llm-compressor / compressed-tensors):
+
+* Preset names: ``MXFP4``, ``MXFP8``.
+* ``data_type="mx_fp"``, ``group_size=32``, ``bits in {4, 8}``.
+
+Schemes that require special packing (FP8, NVFP4, GGUF, INT8_W8A8,
 BF16, FPW8A16, ...) are **not** supported in model-free mode and will raise
 ``ValueError``.  Use the standard AutoRound flow for those.
 
+Output formats
+--------------
+* **INT schemes** → ``auto_round:auto_gptq`` packing format, ``quant_method="auto-round"``.
+* **MXFP schemes** → ``mxfp4-pack-quantized`` or ``mxfp8-quantized`` format,
+  ``quant_method="compressed-tensors"``, compatible with vLLM / llm-compressor.
+
 Usage (CLI)
 -----------
 ::
 
+    # Integer WOQ
     auto_round facebook/opt-125m \\
         --model_free \\
         --scheme W4A16 \\
         --output_dir int4-125m
 
+    # MXFP4
+    auto_round facebook/opt-125m \\
+        --model_free \\
+        --scheme MXFP4 \\
+        --output_dir mxfp4-125m
+
 Usage (API)
 -----------
 ::
 
     from auto_round import AutoRound
 
+    # Integer WOQ
     AutoRound(
         model="facebook/opt-125m",
         scheme="W4A16",
         model_free=True,
     ).quantize_and_save("./int4-125m")
+
+    # MXFP4
+    AutoRound(
+        model="facebook/opt-125m",
+        scheme="MXFP4",
+        model_free=True,
+    ).quantize_and_save("./mxfp4-125m")
 """
 
 from __future__ import annotations
@@ -88,9 +116,9 @@
 # add "embed", "conv" in case of auto detection failure in _check_conv1d_and_embedding
 _BLOCK_NAME_TO_IGNORE = ["shared_expert_gate.", ".gate.", "embed", "conv"]
 
-# Integer WOQ preset schemes that model-free mode can produce.
-# Other presets (FP8/MX/NV/GGUF/BF16/INT8_W8A8/FPW8A16) require different
-# packing kernels not implemented by ``quantize_weight_rtn``.
+# Preset schemes that model-free mode can produce.
+# INT presets use ``auto_round:auto_gptq`` packing; MXFP presets use
+# ``mxfp4-pack-quantized`` or ``mxfp8-quantized`` (compressed-tensors) packing.
 #
 # Note: ``W3A16`` (3-bit) is intentionally excluded.  3-bit packing requires
 # in_features to be padded to a multiple of pack_factor=10, which the current
@@ -1040,8 +1068,13 @@ def _validate_supported_scheme(
 ) -> None:
     """Raise ``ValueError`` if *scheme_obj* is not supported by model-free.
 
-    Model-free only supports integer weight-only quantization (sym/asym),
-    packed in the ``auto_round:auto_gptq`` format.
+    Model-free supports:
+
+    * Integer weight-only quantization (sym/asym), ``bits ∈ {2, 4, 8}``,
+      packed in the ``auto_round:auto_gptq`` format.
+    * MXFP weight quantization (``data_type='mx_fp'``), ``bits ∈ {4, 8}``,
+      ``group_size=32``, packed in ``mxfp4-pack-quantized`` / ``mxfp8-quantized``
+      format (compressed-tensors compatible).
     """
     data_type = (scheme_obj.data_type or "int").lower()
     bits = scheme_obj.bits
@@ -1124,14 +1157,19 @@ class _ModelFreeCompressorCore:
     Args:
         model_name_or_path: HuggingFace model ID or local directory path.
         output_dir: Directory to save the quantized model.
-        scheme: Quantization scheme name (e.g. ``"W4A16"``) or a
-            :class:`QuantizationScheme` instance.
+        scheme: Quantization scheme name (e.g. ``"W4A16"``, ``"MXFP4"``,
+            ``"MXFP8"``) or a :class:`QuantizationScheme` instance.
         layer_config: Per-layer quantization overrides.  Keys are layer
             names or regex patterns; values are dicts with ``bits``,
             ``group_size``, ``sym`` etc.
         ignore_layers: Comma-separated list of layer name patterns to keep
-            in full precision.
-        format: Output format (only ``"auto_round"`` is supported).
+            in full precision.  Ignored layers that are already quantized
+            (e.g. FP8) are preserved in their original format.
+        format: Output format.  Supported: ``"auto_round"``,
+            ``"auto_round:auto_gptq"``, ``"llm_compressor"``,
+            ``"auto_round:llm_compressor"``.  The packing format is
+            auto-selected based on the scheme (INT→auto_gptq,
+            MXFP→compressed-tensors).
         device: Device for quantization computation (``"cpu"`` or
             ``"cuda"``).
         quant_lm_head: If True, quantize ``lm_head`` as well.  By default
@@ -1572,9 +1610,18 @@ def run(self) -> str:
         self._detect_fp8_source()
         self._discover_shards()
 
+        # Determine the output packing format based on scheme data type
+        data_type = (self.default_scheme.get("data_type") or "int").lower()
+        if is_mx_fp(data_type):
+            bits = self.default_scheme.get("bits", 4)
+            packing_format = "mxfp4-pack-quantized" if bits == 4 else "mxfp8-quantized"
+        else:
+            packing_format = "auto_round:auto_gptq"
+
         logger.info(
             f"Model-free quantization: {self.model_name_or_path}\n"
             f"  Scheme: {self.scheme_obj}\n"
+            f"  Packing format: {packing_format}\n"
             f"  Output: {self.output_dir}\n"
             f"  Shards: {len(self.shard_names)}\n"
             f"  Streaming download: {self.is_streaming}\n"
diff --git a/docs/step_by_step.md b/docs/step_by_step.md
index 4a2a8052d..a321e24b1 100644
--- a/docs/step_by_step.md
+++ b/docs/step_by_step.md
@@ -523,7 +523,7 @@ ar.quantize_and_save(output_dir, format="auto_round")
 
 Model-free mode performs RTN WOQ quantization **without loading the full model into memory**. It downloads safetensors files directly, quantizes each Linear weight tensor shard-by-shard, and saves the packed result. This is useful when you want fast, no-calibration quantization with minimal resource requirements.
 
-> **Auto-enabled by default.** As of v0.13, when you pass `--iters 0 --disable_opt_rtn` together with a supported INT WOQ scheme, the CLI automatically takes the model-free path.  This is **bit-exactly equivalent** to the regular `--iters 0 --disable_opt_rtn` flow but uses far less memory.  Use `--disable_model_free` to opt out and force the original flow.
+> **Auto-enabled by default.** As of v0.13, when you pass `--iters 0 --disable_opt_rtn` together with a supported INT WOQ or MXFP scheme, the CLI automatically takes the model-free path.  This is **bit-exactly equivalent** to the regular `--iters 0 --disable_opt_rtn` flow but uses far less memory.  Use `--disable_model_free` to opt out and force the original flow.
 
 **Key features:**
 - **No model object required** – only `config.json` and safetensors files are needed
@@ -537,7 +537,9 @@ Model-free mode performs RTN WOQ quantization **without loading the full model i
 
 **Supported schemes**
 
-Model-free mode currently supports the following **integer weight-only** preset schemes (packed in the `auto_round:auto_gptq` format):
+Model-free mode supports the following preset schemes:
+
+**Integer weight-only** (packed in `auto_round:auto_gptq` format):
 
 | Preset | Bits | Group size | Sym |
 | --- | --- | --- | --- |
@@ -552,7 +554,14 @@ All of the above presets also support **asymmetric quantization** (`sym=False`)
 
 You can also pass a custom `QuantizationScheme(bits=N, group_size=G, sym=True/False, data_type="int", act_bits=16)` with `bits ∈ {2, 4, 8}` and any group_size / sym configuration.
 
-Schemes that require special packing kernels (`W3A16`, `FPW8A16`, `BF16`, `MXFP4`, `MXFP8`, `MXINT4`, `NVFP4`, `FP8_BLOCK`, `FP8_STATIC`, `INT8_W8A8`, `GGUF:*`, ...) are **not** supported in model-free mode and will raise `ValueError`.  Use the regular AutoRound flow for those.
+**MXFP (Microscaling Floating Point)** (packed in `mxfp4-pack-quantized` / `mxfp8-quantized` format, compatible with compressed-tensors / vLLM):
+
+| Preset | Bits | Group size | Format |
+| --- | --- | --- | --- |
+| `MXFP4` | 4 | 32 | mxfp4-pack-quantized |
+| `MXFP8` | 8 | 32 | mxfp8-quantized |
+
+Schemes that require special packing kernels (`W3A16`, `FPW8A16`, `BF16`, `MXINT4`, `NVFP4`, `FP8_BLOCK`, `FP8_STATIC`, `INT8_W8A8`, `GGUF:*`, ...) are **not** supported in model-free mode and will raise `ValueError`.  Use the regular AutoRound flow for those.
 
 #### CLI Usage
 
@@ -584,6 +593,18 @@ auto_round meta-llama/Llama-3.2-1B-Instruct \
   --layer_config "{k_proj:{bits:8},v_proj:{bits:8}}" \
   --ignore_layers "mlp" \
   --output_dir ./int4-llama
+
+# MXFP4 quantization
+auto_round meta-llama/Llama-3.2-1B-Instruct \
+  --model_free \
+  --scheme MXFP4 \
+  --output_dir ./mxfp4-llama
+
+# MXFP8 quantization
+auto_round meta-llama/Llama-3.2-1B-Instruct \
+  --model_free \
+  --scheme MXFP8 \
+  --output_dir ./mxfp8-llama
 ```
 
 #### API Usage
@@ -603,7 +624,7 @@ AutoRound(
 ).quantize_and_save("./int4-llama")
 ```
 
-> **Note:** Model-free mode only supports the `auto_round` output format and uses RTN (no calibration data, no iterative tuning).  For higher-quality quantization or schemes outside the supported list, use the standard AutoRound flow.
+> **Note:** Model-free mode uses RTN (no calibration data, no iterative tuning).  INT schemes are written in the `auto_round:auto_gptq` format; MXFP schemes are written in a compressed-tensors format (`mxfp4-pack-quantized` / `mxfp8-quantized`).  For higher-quality quantization or schemes outside the supported list, use the standard AutoRound flow.
 
 </details>
 
diff --git a/docs/step_by_step_CN.md b/docs/step_by_step_CN.md
index cf4347df0..93a375a3b 100644
--- a/docs/step_by_step_CN.md
+++ b/docs/step_by_step_CN.md
@@ -520,7 +520,7 @@ ar.quantize_and_save(output_dir, format="auto_round")
 
 免模型架构量化模式（Model-Free Mode）可以**无需将完整模型加载到内存中**即可执行 RTN WOQ 量化。它直接下载 safetensors 文件，逐分片地对每个 Linear 权重张量进行量化并保存打包结果。当您需要快速、无标定数据的量化且资源有限时，该模式非常实用。
 
-> **默认自动启用。** 自 v0.13 起，当您同时传入 `--iters 0 --disable_opt_rtn` 与一个受支持的 INT WOQ scheme 时，CLI 会自动走免模型路径。该路径与原始 `--iters 0 --disable_opt_rtn` 流程**位级（bit-exact）等价**，但内存占用大幅降低。如需关闭自动路由、强制使用原始流程，可加 `--disable_model_free`。
+> **默认自动启用。** 自 v0.13 起，当您同时传入 `--iters 0 --disable_opt_rtn` 与一个受支持的 INT WOQ 或 MXFP scheme 时，CLI 会自动走免模型路径。该路径与原始 `--iters 0 --disable_opt_rtn` 流程**位级（bit-exact）等价**，但内存占用大幅降低。如需关闭自动路由、强制使用原始流程，可加 `--disable_model_free`。
 
 **主要特性：**
 - **无需模型对象** — 仅需 `config.json` 和 safetensors 文件
@@ -534,7 +534,9 @@ ar.quantize_and_save(output_dir, format="auto_round")
 
 **支持的 Scheme**
 
-免模型模式当前支持以下整数权重量化预设（均使用 `auto_round:auto_gptq` 打包格式）：
+免模型模式支持以下量化预设：
+
+**整数权重量化**（使用 `auto_round:auto_gptq` 打包格式）：
 
 | Preset | Bits | Group size | Sym |
 | --- | --- | --- | --- |
@@ -549,7 +551,14 @@ ar.quantize_and_save(output_dir, format="auto_round")
 
 也可以传入自定义的 `QuantizationScheme(bits=N, group_size=G, sym=True/False, data_type="int", act_bits=16)`，其中 `bits ∈ {2, 4, 8}`，group_size / sym 可任意设置。
 
-需要特殊打包内核的 scheme（`W3A16`、`FPW8A16`、`BF16`、`MXFP4`、`MXFP8`、`MXINT4`、`NVFP4`、`FP8_BLOCK`、`FP8_STATIC`、`INT8_W8A8`、`GGUF:*` 等）**不被支持**，传入会抛 `ValueError`。这些请使用标准 AutoRound 流程。
+**MXFP（微缩放浮点）**（使用 `mxfp4-pack-quantized` / `mxfp8-quantized` 格式，兼容 compressed-tensors / vLLM）：
+
+| Preset | Bits | Group size | 格式 |
+| --- | --- | --- | --- |
+| `MXFP4` | 4 | 32 | mxfp4-pack-quantized |
+| `MXFP8` | 8 | 32 | mxfp8-quantized |
+
+需要特殊打包内核的 scheme（`W3A16`、`FPW8A16`、`BF16`、`MXINT4`、`NVFP4`、`FP8_BLOCK`、`FP8_STATIC`、`INT8_W8A8`、`GGUF:*` 等）**不被支持**，传入会抛 `ValueError`。这些请使用标准 AutoRound 流程。
 
 #### 命令行用法
 
@@ -581,6 +590,18 @@ auto_round meta-llama/Llama-3.2-1B-Instruct \
   --layer_config "{k_proj:{bits:8},v_proj:{bits:8}}" \
   --ignore_layers "mlp" \
   --output_dir ./int4-llama
+
+# MXFP4 量化
+auto_round meta-llama/Llama-3.2-1B-Instruct \
+  --model_free \
+  --scheme MXFP4 \
+  --output_dir ./mxfp4-llama
+
+# MXFP8 量化
+auto_round meta-llama/Llama-3.2-1B-Instruct \
+  --model_free \
+  --scheme MXFP8 \
+  --output_dir ./mxfp8-llama
 ```
 
 #### API 用法
@@ -600,7 +621,7 @@ AutoRound(
 ).quantize_and_save("./int4-llama")
 ```
 
-> **注意：** 免模型量化模式仅支持 `auto_round` 输出格式，并使用 RTN（无标定数据、无迭代调优）。如需更高质量的量化结果或使用受支持列表外的 scheme，请使用标准 AutoRound 流程。
+> **注意：** 免模型量化模式使用 RTN（无标定数据、无迭代调优）。INT scheme 输出为 `auto_round:auto_gptq` 格式；MXFP scheme 输出为 compressed-tensors 格式（`mxfp4-pack-quantized` / `mxfp8-quantized`）。如需更高质量的量化结果或使用受支持列表外的 scheme，请使用标准 AutoRound 流程。
 
 </details>
 

From 6c3019d04a00d452630fea4cbc22fee488149b97 Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Fri, 29 May 2026 13:10:09 +0800
Subject: [PATCH 3/5] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 auto_round/compressors/model_free.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/auto_round/compressors/model_free.py b/auto_round/compressors/model_free.py
index 528cfdb1a..4cc27d1f0 100644
--- a/auto_round/compressors/model_free.py
+++ b/auto_round/compressors/model_free.py
@@ -876,12 +876,10 @@ def _build_mxfp_quantization_config(
     fmt = "mxfp4-pack-quantized" if bits == 4 else "mxfp8-quantized"
 
     # Default ignore list: any layer present in ignored_layers (deduped) that
-    # was NOT quantized.  Always include 'lm_head' if it appears in ignore set.
+    # was NOT quantized.
     ignore = list(dict.fromkeys(ignored_layers))
     quant_set = set(quantized_layers)
     ignore = [n for n in ignore if n not in quant_set]
-    if "lm_head" not in ignore:
-        ignore.append("lm_head")
 
     qconfig = initialize_quantization(scheme=scheme_name, ignore=ignore)
     qconfig = qconfig.to_dict()

From 82e0962a32dff3124d1d8fa688de685a1669b4f2 Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Fri, 29 May 2026 13:42:55 +0800
Subject: [PATCH 4/5] update per comments

Signed-off-by: Xin He <xin3.he@intel.com>
---
 auto_round/compressors/model_free.py          | 19 ++++++++++++++++--
 test/test_cpu/quantization/test_model_free.py | 20 ++++++++++++++++++-
 2 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/auto_round/compressors/model_free.py b/auto_round/compressors/model_free.py
index 4cc27d1f0..74397f1e0 100644
--- a/auto_round/compressors/model_free.py
+++ b/auto_round/compressors/model_free.py
@@ -813,9 +813,14 @@ def _process_shard(
 
     # Preserve original tensors for ignored/skipped layers so that already-
     # quantized weights (FP8, FP4-packed, etc.) are NOT dequantized.
+    # Check ".weight", ".weight_packed" and ".qweight" so that layers whose
+    # primary tensor uses non-standard naming (e.g. already-quantized layers
+    # stored as ".weight_packed" or ".qweight") are correctly captured.
     preserved_prefixes: set[str] = set()
     for tname in raw_tensors:
-        if tname.endswith(".weight") and (matcher.should_ignore(tname) or matcher.should_skip(tname)):
+        if (tname.endswith(".weight") or tname.endswith(".weight_packed") or tname.endswith(".qweight")) and (
+            matcher.should_ignore(tname) or matcher.should_skip(tname)
+        ):
             preserved_prefixes.add(tname.rsplit(".", 1)[0])
 
     preserved_tensors: dict[str, torch.Tensor] = {}
@@ -1082,6 +1087,16 @@ def _validate_supported_scheme(
     # Activation quantization for MXFP is dynamic at inference time, so the
     # weight-only RTN path here is independent of act_bits.
     if is_mx_fp(data_type):
+        # Restrict to the two explicitly supported MXFP presets when a string
+        # name is provided.  Variants such as MXFP4_RCEIL / MXFP8_RCEIL use a
+        # different activation format; silently mapping them to "MXFP4" /
+        # "MXFP8" in the output config would misrepresent the requested scheme.
+        if isinstance(scheme_input, str) and scheme_input not in ("MXFP4", "MXFP8"):
+            raise ValueError(
+                f"Model-free mode only supports MXFP preset names 'MXFP4' and 'MXFP8', "
+                f"but got '{scheme_input}'. "
+                f"Supported preset schemes: {list(SUPPORTED_PRESET_SCHEMES)}."
+            )
         if bits is None or bits not in _SUPPORTED_MXFP_BITS:
             raise ValueError(
                 f"Model-free mode supports MXFP bits in {_SUPPORTED_MXFP_BITS}, "
@@ -1130,7 +1145,7 @@ def is_model_free_supported_scheme(
     """
     try:
         scheme_obj = _apply_scheme_overrides(scheme, scheme_overrides)
-        _validate_supported_scheme(scheme_obj, scheme_obj)
+        _validate_supported_scheme(scheme_obj, scheme)
         return True
     except (ValueError, TypeError):
         return False
diff --git a/test/test_cpu/quantization/test_model_free.py b/test/test_cpu/quantization/test_model_free.py
index dc0e6f59e..ea3cbf21d 100644
--- a/test/test_cpu/quantization/test_model_free.py
+++ b/test/test_cpu/quantization/test_model_free.py
@@ -34,6 +34,8 @@
 )
 from auto_round.schemes import QuantizationScheme
 
+from ...envs import require_compressed_tensors
+
 # ---------------------------------------------------------------------------
 #  Helpers
 # ---------------------------------------------------------------------------
@@ -349,6 +351,7 @@ def test_quantize_weight_mxfp8_shapes(self):
         assert out["layer.weight_scale"].shape == (64, 4)
         assert out["layer.weight_scale"].dtype == torch.uint8
 
+    @require_compressed_tensors
     @pytest.mark.parametrize("scheme,fmt", [("MXFP4", "mxfp4-pack-quantized"), ("MXFP8", "mxfp8-quantized")])
     def test_e2e_mxfp(self, tmp_path, scheme, fmt):
         tensors = {
@@ -374,6 +377,7 @@ def test_e2e_mxfp(self, tmp_path, scheme, fmt):
         assert "lm_head.weight" in keys
         assert "lm_head.weight_packed" not in keys
 
+    @require_compressed_tensors
     def test_mxfp4_via_autoround_api(self, tmp_path):
         tensors = {"model.layers.0.fc1.weight": torch.randn(128, 128)}
         model_dir = _make_model_dir(tmp_path, _LLAMA_CFG, tensors)
@@ -382,6 +386,7 @@ def test_mxfp4_via_autoround_api(self, tmp_path):
         qc = _read_qconfig(output_dir)
         assert qc["format"] == "mxfp4-pack-quantized"
 
+    @require_compressed_tensors
     def test_process_shard_mxfp(self, tmp_path):
         shard_path = str(tmp_path / "shard.safetensors")
         save_file({"layer.fc1.weight": torch.randn(64, 128)}, shard_path)
@@ -398,12 +403,25 @@ def test_process_shard_mxfp(self, tmp_path):
 
 
 _SUPPORTED = ["W2A16", "W2A16G32", "W2A16G64", "W4A16", "W4A16_MIXED", "W8A16", "MXFP4", "MXFP8"]
-_UNSUPPORTED = ["W3A16", "FPW8A16", "BF16", "MXINT4", "NVFP4", "FP8_BLOCK", "FP8_STATIC", "INT8_W8A8"]
+_UNSUPPORTED = [
+    "W3A16",
+    "FPW8A16",
+    "BF16",
+    "MXINT4",
+    "NVFP4",
+    "FP8_BLOCK",
+    "FP8_STATIC",
+    "INT8_W8A8",
+    "MXFP4_RCEIL",
+    "MXFP8_RCEIL",
+]
 
 
 class TestSchemeValidation:
     @pytest.mark.parametrize("name", _SUPPORTED)
     def test_supported(self, tmp_path, name):
+        if name.startswith("MXFP"):
+            pytest.importorskip("compressed_tensors", reason="test requires compressed-tensors")
         model_dir = _make_model_dir(tmp_path, _LLAMA_CFG, {"model.layers.0.mlp.fc1.weight": torch.randn(64, 128)})
         out = str(tmp_path / f"out_{name}")
         AutoRound(model=model_dir, scheme=name, model_free=True).quantize_and_save(out)

From 87a12cb996cb8caa1480e6f1e5a4c42310bc8301 Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Fri, 29 May 2026 16:03:07 +0800
Subject: [PATCH 5/5] fix segmentation failure

Signed-off-by: Xin He <xin3.he@intel.com>
---
 .../export/test_auto_round_format.py          | 24 ++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/test/test_cuda/export/test_auto_round_format.py b/test/test_cuda/export/test_auto_round_format.py
index d2b2f6814..4bd85f250 100644
--- a/test/test_cuda/export/test_auto_round_format.py
+++ b/test/test_cuda/export/test_auto_round_format.py
@@ -39,7 +39,7 @@ def _save_dir(self, tmp_path):
         shutil.rmtree(self.save_dir, ignore_errors=True)
 
     @require_greater_than_050
-    @pytest.mark.parametrize("bits", [2, 3, 4, 8])
+    @pytest.mark.parametrize("bits", [2, 4, 8])
     @pytest.mark.parametrize("group_size", [32, 128])
     @pytest.mark.parametrize("is_sym", [True, False])
     def test_autoround_format(self, tiny_opt_model_path, bits, group_size, is_sym):
@@ -59,6 +59,28 @@ def test_autoround_format(self, tiny_opt_model_path, bits, group_size, is_sym):
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cuda:0", trust_remote_code=True)
         assert isinstance(model, torch.nn.Module), "Loaded model is not an instance of torch.nn.Module"
 
+    # Run the 3-bit case separately from the [2, 4, 8]-bit cases to avoid a segmentation fault
+    @require_greater_than_050
+    @pytest.mark.parametrize("bits", [3])
+    @pytest.mark.parametrize("group_size", [32, 128])
+    @pytest.mark.parametrize("is_sym", [True, False])
+    def test_autoround_format_3bit(self, tiny_opt_model_path, bits, group_size, is_sym):
+        autoround = AutoRound(
+            tiny_opt_model_path,
+            bits=bits,
+            group_size=group_size,
+            sym=is_sym,
+            iters=0,
+            disable_opt_rtn=True,
+        )
+        quantized_model_path = self.save_dir
+
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+
+        # Verify loading
+        model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cuda:0", trust_remote_code=True)
+        assert isinstance(model, torch.nn.Module), "Loaded model is not an instance of torch.nn.Module"
+
     @pytest.mark.skip_ci(reason="Time-consuming; Accuracy evaluation")
     @require_autogptq
     def test_mixed_precision(self):