From 6618d371492bebec9afcee5a249c44ba1e951ba3 Mon Sep 17 00:00:00 2001 From: Xin He Date: Thu, 28 May 2026 05:47:03 +0000 Subject: [PATCH 1/5] feat: add MXFP4/MXFP8 quantization support and related tests Signed-off-by: Xin He --- auto_round/compressors/model_free.py | 206 +++++++++++++++++- test/test_cpu/quantization/test_model_free.py | 97 ++++++++- 2 files changed, 293 insertions(+), 10 deletions(-) diff --git a/auto_round/compressors/model_free.py b/auto_round/compressors/model_free.py index bd9df7244..54e773c4c 100644 --- a/auto_round/compressors/model_free.py +++ b/auto_round/compressors/model_free.py @@ -74,6 +74,7 @@ import torch from auto_round import envs +from auto_round.compressors.utils import is_mx_fp from auto_round.logger import logger from auto_round.schemes import PRESET_SCHEMES, QuantizationScheme, preset_name_to_scheme from auto_round.utils.common import AUDIO_MM_KEYS, VISION_MM_KEYS, compress_layer_names, to_standard_regex @@ -101,12 +102,17 @@ "W4A16", "W4A16_MIXED", "W8A16", + "MXFP4", + "MXFP8", ) # Allowed ``bits`` values for integer WOQ. # 3-bit is excluded — see note above. _SUPPORTED_INT_BITS: tuple[int, ...] = (2, 4, 8) +# Allowed ``bits`` values for MXFP weight quantization. +_SUPPORTED_MXFP_BITS: tuple[int, ...] = (4, 8) + # Multimodal keywords kept in full precision by default. _NONTEXT_KEYWORDS: tuple[str, ...] = VISION_MM_KEYS + AUDIO_MM_KEYS @@ -449,6 +455,77 @@ def _resolve_uncached(self, tensor_name: str) -> dict | None: # --------------------------------------------------------------------------- +def _quantize_weight_mxfp( + weight: torch.Tensor, + layer_name: str, + bits: int, + group_size: int, + data_type: str, + device: str = "cpu", +) -> dict[str, torch.Tensor]: + """Quantize a 2D weight tensor to MXFP4 / MXFP8 and return packed outputs. 
+ + Reuses :func:`auto_round.data_type.mxfp.quant_mx` to derive the per-block + shared exponent (E8M0 scale), and :class:`auto_round.export.export_to_autoround.qlinear_fp.QuantLinear` + to perform the same packing as :func:`auto_round.export.export_to_llmcompressor.export_to_fp.pack_layer`. + + Returns a dict with one of: + * MXFP8: ``{layer_name+'.weight': float8_e4m3fn, layer_name+'.weight_scale': uint8}`` + * MXFP4: ``{layer_name+'.weight_packed': uint8, layer_name+'.weight_scale': uint8}`` + """ + import torch.nn as nn + + from auto_round.data_type.mxfp import quant_mx + from auto_round.export.export_to_autoround.qlinear_fp import QuantLinear + + if not is_mx_fp(data_type): + data_type = "mx_fp" + + out_features, in_features = weight.shape + if in_features % group_size != 0: + raise ValueError( + f"in_features={in_features} for layer '{layer_name}' is not divisible " + f"by MXFP group_size={group_size}; cannot pack." + ) + + weight_dev = weight.to(device) + # quant_mx returns (qdq_tensor, shared_exp, None). We only need shared_exp + # (the per-block log2 scale). The element-wise rounding to the FP4/FP8 grid + # is performed inside QuantLinear.pack via dtype casts / pack_fp4_to_uint8. + _, shared_exp, _ = quant_mx(weight_dev, bits=bits, group_size=group_size, data_type=data_type) + # Reshape to (out_features, n_groups) so the on-disk weight_scale matches + # the llm-compressor convention (and QuantLinear's registered buffer shape). + shared_exp = shared_exp.reshape(out_features, in_features // group_size) + + # Build a lightweight nn.Linear holding the original weight so we can + # delegate packing to the existing QuantLinear.pack implementation. 
+ fake_linear = nn.Linear(in_features, out_features, bias=False) + with torch.no_grad(): + fake_linear.weight = nn.Parameter(weight_dev, requires_grad=False) + + qlayer = QuantLinear( + bits=bits, + group_size=group_size, + infeatures=in_features, + outfeatures=out_features, + bias=False, + data_type="mx_fp4" if bits == 4 else "mx_fp8e4m3", + sym=True, + act_bits=bits, + ) + qlayer.pack(fake_linear, shared_exp, device=device) + + if bits == 8: + return { + f"{layer_name}.weight": qlayer.weight.to("cpu"), + f"{layer_name}.weight_scale": qlayer.weight_scale.to("cpu"), + } + return { + f"{layer_name}.weight_packed": qlayer.weight_packed.to("cpu"), + f"{layer_name}.weight_scale": qlayer.weight_scale.to("cpu"), + } + + def _quantize_single_tensor( tensor_name: str, tensor: torch.Tensor, @@ -481,10 +558,29 @@ def _quantize_single_tensor( bits = scheme["bits"] group_size = scheme["group_size"] sym = scheme.get("sym", True) + data_type = (scheme.get("data_type") or "int").lower() if bits >= 16: return layer_name, {tensor_name: tensor}, None, layer_name + # ---- MXFP path (MXFP4 / MXFP8) ---- + if is_mx_fp(data_type): + try: + out = _quantize_weight_mxfp( + weight=tensor, + layer_name=layer_name, + bits=bits, + group_size=group_size, + data_type=data_type, + device=device, + ) + logger.debug(f"Quantized (MXFP): {layer_name} (bits={bits}, group_size={group_size})") + return layer_name, out, layer_name, None + except Exception as e: + logger.warning(f"Failed to MXFP-quantize {layer_name}: {e}. Keeping original weight.") + return layer_name, {tensor_name: tensor}, None, layer_name + + # ---- Integer WOQ path ---- try: qweight, qzeros, scales = quantize_weight_rtn( weight=tensor, @@ -686,7 +782,23 @@ def _process_shard( raw_tensors = {name: f.get_tensor(name) for name in f.keys()} raw_tensors = split_fused_expert_tensors(raw_tensors) + + # Preserve original tensors for ignored/skipped layers so that already- + # quantized weights (FP8, FP4-packed, etc.) are NOT dequantized. 
+ preserved_prefixes: set[str] = set() + for tname in raw_tensors: + if tname.endswith(".weight") and (matcher.should_ignore(tname) or matcher.should_skip(tname)): + preserved_prefixes.add(tname.rsplit(".", 1)[0]) + + preserved_tensors: dict[str, torch.Tensor] = {} + if preserved_prefixes: + for key in list(raw_tensors.keys()): + prefix = key.rsplit(".", 1)[0] + if prefix in preserved_prefixes: + preserved_tensors[key] = raw_tensors.pop(key) + raw_tensors = _dequant_fp8_tensors(raw_tensors, block_size=fp8_block_size) + raw_tensors.update(preserved_tensors) for tensor_name in list(raw_tensors.keys()): tensor = raw_tensors.pop(tensor_name) @@ -710,6 +822,46 @@ def _process_shard( # --------------------------------------------------------------------------- +def _build_mxfp_quantization_config( + default_scheme: dict, + quantized_layers: list[str], + ignored_layers: list[str], +) -> dict: + """Build a compressed-tensors / llm-compressor style quantization_config + dict for MXFP4 / MXFP8 model-free output. + + Mirrors the per-group format produced by + :mod:`auto_round.export.export_to_llmcompressor.export_to_fp`. + """ + from auto_round.export.export_to_llmcompressor.config import ( + check_compressed_tensors_supported, + initialize_quantization, + ) + + check_compressed_tensors_supported(raise_error=True) + + bits = default_scheme["bits"] + if bits not in _SUPPORTED_MXFP_BITS: + raise ValueError(f"Unsupported MXFP bits={bits} for model-free output.") + + scheme_name = "MXFP4" if bits == 4 else "MXFP8" + fmt = "mxfp4-pack-quantized" if bits == 4 else "mxfp8-quantized" + + # Default ignore list: any layer present in ignored_layers (deduped) that + # was NOT quantized. Always include 'lm_head' if it appears in ignore set. 
+ ignore = list(dict.fromkeys(ignored_layers)) + quant_set = set(quantized_layers) + ignore = [n for n in ignore if n not in quant_set] + if "lm_head" not in ignore: + ignore.append("lm_head") + + qconfig = initialize_quantization(scheme=scheme_name, ignore=ignore) + qconfig = qconfig.to_dict() + qconfig["format"] = fmt + qconfig["provider"] = "auto-round" + return qconfig + + def _build_quantization_config( default_scheme: dict, layer_config: dict, @@ -719,6 +871,14 @@ def _build_quantization_config( block_name_to_quantize: Optional[list[str]] = None, ) -> dict: """Build a quantization_config dict compatible with auto-round format.""" + # MXFP (mx_fp) uses the llm-compressor / compressed-tensors style config. + if is_mx_fp((default_scheme.get("data_type") or "int").lower()): + return _build_mxfp_quantization_config( + default_scheme=default_scheme, + quantized_layers=quantized_layers, + ignored_layers=ignored_layers, + ) + from auto_round.version import __version__ scheme_keys = [f.name for f in fields(QuantizationScheme)] @@ -883,7 +1043,28 @@ def _validate_supported_scheme( Model-free only supports integer weight-only quantization (sym/asym), packed in the ``auto_round:auto_gptq`` format. """ + data_type = (scheme_obj.data_type or "int").lower() + bits = scheme_obj.bits act_bits = scheme_obj.act_bits if scheme_obj.act_bits is not None else 16 + + # MXFP weight-only path: accept mx_fp data type with bits in {4, 8}. + # Activation quantization for MXFP is dynamic at inference time, so the + # weight-only RTN path here is independent of act_bits. + if is_mx_fp(data_type): + if bits is None or bits not in _SUPPORTED_MXFP_BITS: + raise ValueError( + f"Model-free mode supports MXFP bits in {_SUPPORTED_MXFP_BITS}, " + f"but '{scheme_input}' requests bits={bits}. " + f"Supported preset schemes: {list(SUPPORTED_PRESET_SCHEMES)}." 
+ ) + group_size = scheme_obj.group_size + if group_size not in (None, 32): + raise ValueError( + f"Model-free mode supports MXFP only with group_size=32, " + f"but '{scheme_input}' requests group_size={group_size}." + ) + return + if act_bits < 16: raise ValueError( f"Model-free mode only supports weight-only quantization (WOQ) schemes " @@ -891,17 +1072,15 @@ def _validate_supported_scheme( f"Supported preset schemes: {list(SUPPORTED_PRESET_SCHEMES)}." ) - data_type = (scheme_obj.data_type or "int").lower() if data_type != "int": raise ValueError( f"Model-free mode only supports integer weight quantization " - f"(data_type='int'), but '{scheme_input}' has data_type='{data_type}'. " - f"FP8 / MXFP / NVFP / GGUF / BF16 schemes require the standard " - f"AutoRound flow. Supported preset schemes: " + f"(data_type='int') or MXFP (data_type='mx_fp'), but '{scheme_input}' " + f"has data_type='{data_type}'. FP8 / NVFP / GGUF / BF16 schemes require " + f"the standard AutoRound flow. Supported preset schemes: " f"{list(SUPPORTED_PRESET_SCHEMES)}." ) - bits = scheme_obj.bits if bits is None or bits not in _SUPPORTED_INT_BITS: raise ValueError( f"Model-free mode supports bits in {_SUPPORTED_INT_BITS}, " @@ -963,7 +1142,12 @@ class _ModelFreeCompressorCore: modules are kept in full precision. """ - SUPPORTED_FORMATS: tuple[str, ...] = ("auto_round",) + SUPPORTED_FORMATS: tuple[str, ...] = ( + "auto_round", + "auto_round:auto_gptq", + "llm_compressor", + "auto_round:llm_compressor", + ) def __init__( self, @@ -1610,7 +1794,15 @@ def quantize_and_save( **kwargs, ): """Quantize and save — AutoRound compressor entry point.""" - if format not in ["auto_round", "auto_round:auto_gptq"]: + # Accept the standard auto_round formats plus llm_compressor variants + # (the latter are the natural output for MXFP4 / MXFP8 schemes). 
+ _accepted_formats = { + "auto_round", + "auto_round:auto_gptq", + "llm_compressor", + "auto_round:llm_compressor", + } + if format not in _accepted_formats: return self._fallback_to_quantize_and_save(output_dir=output_dir, format=format, inplace=inplace, **kwargs) # Apply user scheme overrides before running diff --git a/test/test_cpu/quantization/test_model_free.py b/test/test_cpu/quantization/test_model_free.py index 736b6c910..dc0e6f59e 100644 --- a/test/test_cpu/quantization/test_model_free.py +++ b/test/test_cpu/quantization/test_model_free.py @@ -28,6 +28,7 @@ _ModelFreeCompressorCore, _PatternMatcher, _process_shard, + _quantize_weight_mxfp, get_predefined_ignore_layers_from_config, is_model_free_supported_scheme, ) @@ -254,6 +255,25 @@ def test_process_shard_fp8(self, tmp_path): output, quantized, _ = _process_shard(shard_path, _DEFAULT_SCHEME, {}, [], device="cpu", fp8_block_size=None) assert "layer" in quantized and "layer.qweight" in output + def test_ignored_layer_preserves_original_fp8(self, tmp_path): + """Ignored layers keep their original quantized tensors (no dequant).""" + shard_path = str(tmp_path / "shard.safetensors") + w_fp8 = torch.randn(64, 128, dtype=torch.bfloat16).to(torch.float8_e4m3fn) + scale = torch.tensor(0.5) + save_file( + {"lm_head.weight": w_fp8, "lm_head.weight_scale_inv": scale, "layer.weight": torch.randn(64, 128)}, + shard_path, + ) + output, quantized, ignored = _process_shard( + shard_path, _DEFAULT_SCHEME, {}, ["lm_head"], device="cpu", fp8_block_size=None + ) + # lm_head should be ignored and kept in original FP8 format + assert "lm_head" in ignored + assert output["lm_head.weight"].dtype == torch.float8_e4m3fn + assert "lm_head.weight_scale_inv" in output + # non-ignored layer should be quantized normally + assert "layer" in quantized + # =========================================================================== # End-to-end ModelFreeQuantize @@ -305,13 +325,80 @@ def test_asym(self, tmp_path): assert qc["sym"] is 
False and qc["group_size"] == 64 +# =========================================================================== +# MXFP4 / MXFP8 model-free quantization +# =========================================================================== + + +class TestModelFreeMXFP: + """End-to-end tests for MXFP4/MXFP8 model-free quantization.""" + + def test_quantize_weight_mxfp4_shapes(self): + w = torch.randn(64, 128, dtype=torch.bfloat16) + out = _quantize_weight_mxfp(w, "layer", bits=4, group_size=32, data_type="mx_fp") + assert out["layer.weight_packed"].shape == (64, 64) # in_features / 2 + assert out["layer.weight_packed"].dtype == torch.uint8 + assert out["layer.weight_scale"].shape == (64, 4) # in_features / group_size + assert out["layer.weight_scale"].dtype == torch.uint8 + + def test_quantize_weight_mxfp8_shapes(self): + w = torch.randn(64, 128, dtype=torch.bfloat16) + out = _quantize_weight_mxfp(w, "layer", bits=8, group_size=32, data_type="mx_fp") + assert out["layer.weight"].shape == (64, 128) + assert out["layer.weight"].dtype == torch.float8_e4m3fn + assert out["layer.weight_scale"].shape == (64, 4) + assert out["layer.weight_scale"].dtype == torch.uint8 + + @pytest.mark.parametrize("scheme,fmt", [("MXFP4", "mxfp4-pack-quantized"), ("MXFP8", "mxfp8-quantized")]) + def test_e2e_mxfp(self, tmp_path, scheme, fmt): + tensors = { + "model.layers.0.self_attn.q_proj.weight": torch.randn(128, 128), + "model.layers.0.fc1.weight": torch.randn(512, 128), + "lm_head.weight": torch.randn(1000, 128), + } + model_dir = _make_model_dir(tmp_path, _LLAMA_CFG, tensors) + output_dir = str(tmp_path / "output") + _ModelFreeCompressorCore(model_name_or_path=model_dir, output_dir=output_dir, scheme=scheme).run() + qc = _read_qconfig(output_dir) + assert qc["format"] == fmt + assert qc["quant_method"] == "compressed-tensors" + assert "lm_head" in qc["ignore"] + keys = _read_output_keys(output_dir) + # MXFP4 produces weight_packed, MXFP8 produces weight + if scheme == "MXFP4": + assert 
"model.layers.0.fc1.weight_packed" in keys + else: + assert "model.layers.0.fc1.weight" in keys + assert "model.layers.0.fc1.weight_scale" in keys + # lm_head stays full precision + assert "lm_head.weight" in keys + assert "lm_head.weight_packed" not in keys + + def test_mxfp4_via_autoround_api(self, tmp_path): + tensors = {"model.layers.0.fc1.weight": torch.randn(128, 128)} + model_dir = _make_model_dir(tmp_path, _LLAMA_CFG, tensors) + output_dir = str(tmp_path / "output") + AutoRound(model=model_dir, scheme="MXFP4", model_free=True).quantize_and_save(output_dir) + qc = _read_qconfig(output_dir) + assert qc["format"] == "mxfp4-pack-quantized" + + def test_process_shard_mxfp(self, tmp_path): + shard_path = str(tmp_path / "shard.safetensors") + save_file({"layer.fc1.weight": torch.randn(64, 128)}, shard_path) + scheme = {"bits": 4, "group_size": 32, "sym": True, "data_type": "mx_fp"} + output, quantized, _ = _process_shard(shard_path, scheme, {}, []) + assert "layer.fc1" in quantized + assert "layer.fc1.weight_packed" in output + assert "layer.fc1.weight_scale" in output + + # =========================================================================== # Scheme validation # =========================================================================== -_SUPPORTED = ["W2A16", "W2A16G32", "W2A16G64", "W4A16", "W4A16_MIXED", "W8A16"] -_UNSUPPORTED = ["W3A16", "FPW8A16", "BF16", "MXFP4", "MXFP8", "MXINT4", "NVFP4", "FP8_BLOCK", "FP8_STATIC", "INT8_W8A8"] +_SUPPORTED = ["W2A16", "W2A16G32", "W2A16G64", "W4A16", "W4A16_MIXED", "W8A16", "MXFP4", "MXFP8"] +_UNSUPPORTED = ["W3A16", "FPW8A16", "BF16", "MXINT4", "NVFP4", "FP8_BLOCK", "FP8_STATIC", "INT8_W8A8"] class TestSchemeValidation: @@ -320,7 +407,11 @@ def test_supported(self, tmp_path, name): model_dir = _make_model_dir(tmp_path, _LLAMA_CFG, {"model.layers.0.mlp.fc1.weight": torch.randn(64, 128)}) out = str(tmp_path / f"out_{name}") AutoRound(model=model_dir, scheme=name, model_free=True).quantize_and_save(out) - assert 
"model.layers.0.mlp.fc1.qweight" in _read_output_keys(out) + keys = _read_output_keys(out) + if name.startswith("MXFP"): + assert "model.layers.0.mlp.fc1.weight_scale" in keys + else: + assert "model.layers.0.mlp.fc1.qweight" in keys @pytest.mark.parametrize("name", _UNSUPPORTED) def test_unsupported_raises(self, tmp_path, name): From aedf04f58efbe35986e2352c318b31b7456b9767 Mon Sep 17 00:00:00 2001 From: Xin He Date: Thu, 28 May 2026 06:27:10 +0000 Subject: [PATCH 2/5] update document Signed-off-by: Xin He --- auto_round/compressors/model_free.py | 77 ++++++++++++++++++++++------ docs/step_by_step.md | 29 +++++++++-- docs/step_by_step_CN.md | 29 +++++++++-- 3 files changed, 112 insertions(+), 23 deletions(-) diff --git a/auto_round/compressors/model_free.py b/auto_round/compressors/model_free.py index 54e773c4c..528cfdb1a 100644 --- a/auto_round/compressors/model_free.py +++ b/auto_round/compressors/model_free.py @@ -24,39 +24,67 @@ Supported schemes ----------------- -Model-free mode currently supports **integer weight-only** quantization -schemes packed in the ``auto_round:auto_gptq`` format only. Specifically: +Model-free mode supports the following quantization families: -* Preset names: ``W2A16``, ``W2A16G32``, ``W2A16G64``, ``W3A16``, ``W4A16``, - ``W8A16``. +**Integer weight-only** (packed in ``auto_round:auto_gptq`` format): + +* Preset names: ``W2A16``, ``W2A16G32``, ``W2A16G64``, ``W4A16``, + ``W4A16_MIXED``, ``W8A16``. * Custom :class:`~auto_round.schemes.QuantizationScheme` instances with - ``data_type="int"``, ``bits in {2, 3, 4, 8}``, ``act_bits >= 16``, and any + ``data_type="int"``, ``bits in {2, 4, 8}``, ``act_bits >= 16``, and any symmetric / asymmetric configuration. -Schemes that require special packing (FP8, MXFP4, NVFP4, GGUF, INT8_W8A8, +**MXFP (Microscaling Floating Point)** (packed in ``mxfp4-pack-quantized`` or +``mxfp8-quantized`` format, compatible with llm-compressor / compressed-tensors): + +* Preset names: ``MXFP4``, ``MXFP8``. 
+* ``data_type="mx_fp"``, ``group_size=32``, ``bits in {4, 8}``. + +Schemes that require special packing (FP8, NVFP4, GGUF, INT8_W8A8, BF16, FPW8A16, ...) are **not** supported in model-free mode and will raise ``ValueError``. Use the standard AutoRound flow for those. +Output formats +-------------- +* **INT schemes** → ``auto_round:auto_gptq`` packing format, ``quant_method="auto-round"``. +* **MXFP schemes** → ``mxfp4-pack-quantized`` or ``mxfp8-quantized`` format, + ``quant_method="compressed-tensors"``, compatible with vLLM / llm-compressor. + Usage (CLI) ----------- :: + # Integer WOQ auto_round facebook/opt-125m \\ --model_free \\ --scheme W4A16 \\ --output_dir int4-125m + # MXFP4 + auto_round facebook/opt-125m \\ + --model_free \\ + --scheme MXFP4 \\ + --output_dir mxfp4-125m + Usage (API) ----------- :: from auto_round import AutoRound + # Integer WOQ AutoRound( model="facebook/opt-125m", scheme="W4A16", model_free=True, ).quantize_and_save("./int4-125m") + + # MXFP4 + AutoRound( + model="facebook/opt-125m", + scheme="MXFP4", + model_free=True, + ).quantize_and_save("./mxfp4-125m") """ from __future__ import annotations @@ -88,9 +116,9 @@ # add "embed", "conv" in case of auto detection failure in _check_conv1d_and_embedding _BLOCK_NAME_TO_IGNORE = ["shared_expert_gate.", ".gate.", "embed", "conv"] -# Integer WOQ preset schemes that model-free mode can produce. -# Other presets (FP8/MX/NV/GGUF/BF16/INT8_W8A8/FPW8A16) require different -# packing kernels not implemented by ``quantize_weight_rtn``. +# Preset schemes that model-free mode can produce. +# INT presets use ``auto_round:auto_gptq`` packing; MXFP presets use +# ``mxfp4-pack-quantized`` or ``mxfp8-quantized`` (compressed-tensors) packing. # # Note: ``W3A16`` (3-bit) is intentionally excluded. 
3-bit packing requires # in_features to be padded to a multiple of pack_factor=10, which the current @@ -1040,8 +1068,13 @@ def _validate_supported_scheme( ) -> None: """Raise ``ValueError`` if *scheme_obj* is not supported by model-free. - Model-free only supports integer weight-only quantization (sym/asym), - packed in the ``auto_round:auto_gptq`` format. + Model-free supports: + + * Integer weight-only quantization (sym/asym), ``bits ∈ {2, 4, 8}``, + packed in the ``auto_round:auto_gptq`` format. + * MXFP weight quantization (``data_type='mx_fp'``), ``bits ∈ {4, 8}``, + ``group_size=32``, packed in ``mxfp4-pack-quantized`` / ``mxfp8-quantized`` + format (compressed-tensors compatible). """ data_type = (scheme_obj.data_type or "int").lower() bits = scheme_obj.bits @@ -1124,14 +1157,19 @@ class _ModelFreeCompressorCore: Args: model_name_or_path: HuggingFace model ID or local directory path. output_dir: Directory to save the quantized model. - scheme: Quantization scheme name (e.g. ``"W4A16"``) or a - :class:`QuantizationScheme` instance. + scheme: Quantization scheme name (e.g. ``"W4A16"``, ``"MXFP4"``, + ``"MXFP8"``) or a :class:`QuantizationScheme` instance. layer_config: Per-layer quantization overrides. Keys are layer names or regex patterns; values are dicts with ``bits``, ``group_size``, ``sym`` etc. ignore_layers: Comma-separated list of layer name patterns to keep - in full precision. - format: Output format (only ``"auto_round"`` is supported). + in full precision. Ignored layers that are already quantized + (e.g. FP8) are preserved in their original format. + format: Output format. Supported: ``"auto_round"``, + ``"auto_round:auto_gptq"``, ``"llm_compressor"``, + ``"auto_round:llm_compressor"``. The packing format is + auto-selected based on the scheme (INT→auto_gptq, + MXFP→compressed-tensors). device: Device for quantization computation (``"cpu"`` or ``"cuda"``). quant_lm_head: If True, quantize ``lm_head`` as well. 
By default @@ -1572,9 +1610,18 @@ def run(self) -> str: self._detect_fp8_source() self._discover_shards() + # Determine the output packing format based on scheme data type + data_type = (self.default_scheme.get("data_type") or "int").lower() + if is_mx_fp(data_type): + bits = self.default_scheme.get("bits", 4) + packing_format = "mxfp4-pack-quantized" if bits == 4 else "mxfp8-quantized" + else: + packing_format = "auto_round:auto_gptq" + logger.info( f"Model-free quantization: {self.model_name_or_path}\n" f" Scheme: {self.scheme_obj}\n" + f" Packing format: {packing_format}\n" f" Output: {self.output_dir}\n" f" Shards: {len(self.shard_names)}\n" f" Streaming download: {self.is_streaming}\n" diff --git a/docs/step_by_step.md b/docs/step_by_step.md index 4a2a8052d..a321e24b1 100644 --- a/docs/step_by_step.md +++ b/docs/step_by_step.md @@ -523,7 +523,7 @@ ar.quantize_and_save(output_dir, format="auto_round") Model-free mode performs RTN WOQ quantization **without loading the full model into memory**. It downloads safetensors files directly, quantizes each Linear weight tensor shard-by-shard, and saves the packed result. This is useful when you want fast, no-calibration quantization with minimal resource requirements. -> **Auto-enabled by default.** As of v0.13, when you pass `--iters 0 --disable_opt_rtn` together with a supported INT WOQ scheme, the CLI automatically takes the model-free path. This is **bit-exactly equivalent** to the regular `--iters 0 --disable_opt_rtn` flow but uses far less memory. Use `--disable_model_free` to opt out and force the original flow. +> **Auto-enabled by default.** As of v0.13, when you pass `--iters 0 --disable_opt_rtn` together with a supported INT WOQ or MXFP scheme, the CLI automatically takes the model-free path. This is **bit-exactly equivalent** to the regular `--iters 0 --disable_opt_rtn` flow but uses far less memory. Use `--disable_model_free` to opt out and force the original flow. 
**Key features:** - **No model object required** – only `config.json` and safetensors files are needed @@ -537,7 +537,9 @@ Model-free mode performs RTN WOQ quantization **without loading the full model i **Supported schemes** -Model-free mode currently supports the following **integer weight-only** preset schemes (packed in the `auto_round:auto_gptq` format): +Model-free mode supports the following preset schemes: + +**Integer weight-only** (packed in `auto_round:auto_gptq` format): | Preset | Bits | Group size | Sym | | --- | --- | --- | --- | @@ -552,7 +554,14 @@ All of the above presets also support **asymmetric quantization** (`sym=False`) You can also pass a custom `QuantizationScheme(bits=N, group_size=G, sym=True/False, data_type="int", act_bits=16)` with `bits ∈ {2, 4, 8}` and any group_size / sym configuration. -Schemes that require special packing kernels (`W3A16`, `FPW8A16`, `BF16`, `MXFP4`, `MXFP8`, `MXINT4`, `NVFP4`, `FP8_BLOCK`, `FP8_STATIC`, `INT8_W8A8`, `GGUF:*`, ...) are **not** supported in model-free mode and will raise `ValueError`. Use the regular AutoRound flow for those. +**MXFP (Microscaling Floating Point)** (packed in `mxfp4-pack-quantized` / `mxfp8-quantized` format, compatible with compressed-tensors / vLLM): + +| Preset | Bits | Group size | Format | +| --- | --- | --- | --- | +| `MXFP4` | 4 | 32 | mxfp4-pack-quantized | +| `MXFP8` | 8 | 32 | mxfp8-quantized | + +Schemes that require special packing kernels (`W3A16`, `FPW8A16`, `BF16`, `MXINT4`, `NVFP4`, `FP8_BLOCK`, `FP8_STATIC`, `INT8_W8A8`, `GGUF:*`, ...) are **not** supported in model-free mode and will raise `ValueError`. Use the regular AutoRound flow for those. 
#### CLI Usage @@ -584,6 +593,18 @@ auto_round meta-llama/Llama-3.2-1B-Instruct \ --layer_config "{k_proj:{bits:8},v_proj:{bits:8}}" \ --ignore_layers "mlp" \ --output_dir ./int4-llama + +# MXFP4 quantization +auto_round meta-llama/Llama-3.2-1B-Instruct \ + --model_free \ + --scheme MXFP4 \ + --output_dir ./mxfp4-llama + +# MXFP8 quantization +auto_round meta-llama/Llama-3.2-1B-Instruct \ + --model_free \ + --scheme MXFP8 \ + --output_dir ./mxfp8-llama ``` #### API Usage @@ -603,7 +624,7 @@ AutoRound( ).quantize_and_save("./int4-llama") ``` -> **Note:** Model-free mode only supports the `auto_round` output format and uses RTN (no calibration data, no iterative tuning). For higher-quality quantization or schemes outside the supported list, use the standard AutoRound flow. +> **Note:** Model-free mode uses RTN (no calibration data, no iterative tuning). INT schemes output in `auto_round:auto_gptq` format; MXFP schemes output in compressed-tensors format (`mxfp4-pack-quantized` / `mxfp8-quantized`). For higher-quality quantization or schemes outside the supported list, use the standard AutoRound flow. 
diff --git a/docs/step_by_step_CN.md b/docs/step_by_step_CN.md index cf4347df0..93a375a3b 100644 --- a/docs/step_by_step_CN.md +++ b/docs/step_by_step_CN.md @@ -520,7 +520,7 @@ ar.quantize_and_save(output_dir, format="auto_round") 免模型架构量化模式(Model-Free Mode)可以**无需将完整模型加载到内存中**即可执行 RTN WOQ 量化。它直接下载 safetensors 文件,逐分片地对每个 Linear 权重张量进行量化并保存打包结果。当您需要快速、无标定数据的量化且资源有限时,该模式非常实用。 -> **默认自动启用。** 自 v0.13 起,当您同时传入 `--iters 0 --disable_opt_rtn` 与一个受支持的 INT WOQ scheme 时,CLI 会自动走免模型路径。该路径与原始 `--iters 0 --disable_opt_rtn` 流程**位级(bit-exact)等价**,但内存占用大幅降低。如需关闭自动路由、强制使用原始流程,可加 `--disable_model_free`。 +> **默认自动启用。** 自 v0.13 起,当您同时传入 `--iters 0 --disable_opt_rtn` 与一个受支持的 INT WOQ 或 MXFP scheme 时,CLI 会自动走免模型路径。该路径与原始 `--iters 0 --disable_opt_rtn` 流程**位级(bit-exact)等价**,但内存占用大幅降低。如需关闭自动路由、强制使用原始流程,可加 `--disable_model_free`。 **主要特性:** - **无需模型对象** — 仅需 `config.json` 和 safetensors 文件 @@ -534,7 +534,9 @@ ar.quantize_and_save(output_dir, format="auto_round") **支持的 Scheme** -免模型模式当前支持以下整数权重量化预设(均使用 `auto_round:auto_gptq` 打包格式): +免模型模式支持以下量化预设: + +**整数权重量化**(使用 `auto_round:auto_gptq` 打包格式): | Preset | Bits | Group size | Sym | | --- | --- | --- | --- | @@ -549,7 +551,14 @@ ar.quantize_and_save(output_dir, format="auto_round") 也可以传入自定义的 `QuantizationScheme(bits=N, group_size=G, sym=True/False, data_type="int", act_bits=16)`,其中 `bits ∈ {2, 4, 8}`,group_size / sym 可任意设置。 -需要特殊打包内核的 scheme(`W3A16`、`FPW8A16`、`BF16`、`MXFP4`、`MXFP8`、`MXINT4`、`NVFP4`、`FP8_BLOCK`、`FP8_STATIC`、`INT8_W8A8`、`GGUF:*` 等)**不被支持**,传入会抛 `ValueError`。这些请使用标准 AutoRound 流程。 +**MXFP(微缩放浮点)**(使用 `mxfp4-pack-quantized` / `mxfp8-quantized` 格式,兼容 compressed-tensors / vLLM): + +| Preset | Bits | Group size | 格式 | +| --- | --- | --- | --- | +| `MXFP4` | 4 | 32 | mxfp4-pack-quantized | +| `MXFP8` | 8 | 32 | mxfp8-quantized | + +需要特殊打包内核的 scheme(`W3A16`、`FPW8A16`、`BF16`、`MXINT4`、`NVFP4`、`FP8_BLOCK`、`FP8_STATIC`、`INT8_W8A8`、`GGUF:*` 等)**不被支持**,传入会抛 `ValueError`。这些请使用标准 AutoRound 流程。 #### 命令行用法 @@ -581,6 +590,18 @@ auto_round 
meta-llama/Llama-3.2-1B-Instruct \ --layer_config "{k_proj:{bits:8},v_proj:{bits:8}}" \ --ignore_layers "mlp" \ --output_dir ./int4-llama + +# MXFP4 量化 +auto_round meta-llama/Llama-3.2-1B-Instruct \ + --model_free \ + --scheme MXFP4 \ + --output_dir ./mxfp4-llama + +# MXFP8 量化 +auto_round meta-llama/Llama-3.2-1B-Instruct \ + --model_free \ + --scheme MXFP8 \ + --output_dir ./mxfp8-llama ``` #### API 用法 @@ -600,7 +621,7 @@ AutoRound( ).quantize_and_save("./int4-llama") ``` -> **注意:** 免模型量化模式仅支持 `auto_round` 输出格式,并使用 RTN(无标定数据、无迭代调优)。如需更高质量的量化结果或使用受支持列表外的 scheme,请使用标准 AutoRound 流程。 +> **注意:** 免模型量化模式使用 RTN(无标定数据、无迭代调优)。INT scheme 输出为 `auto_round:auto_gptq` 格式;MXFP scheme 输出为 compressed-tensors 格式(`mxfp4-pack-quantized` / `mxfp8-quantized`)。如需更高质量的量化结果或使用受支持列表外的 scheme,请使用标准 AutoRound 流程。 From 6c3019d04a00d452630fea4cbc22fee488149b97 Mon Sep 17 00:00:00 2001 From: Xin He Date: Fri, 29 May 2026 13:10:09 +0800 Subject: [PATCH 3/5] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- auto_round/compressors/model_free.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/auto_round/compressors/model_free.py b/auto_round/compressors/model_free.py index 528cfdb1a..4cc27d1f0 100644 --- a/auto_round/compressors/model_free.py +++ b/auto_round/compressors/model_free.py @@ -876,12 +876,10 @@ def _build_mxfp_quantization_config( fmt = "mxfp4-pack-quantized" if bits == 4 else "mxfp8-quantized" # Default ignore list: any layer present in ignored_layers (deduped) that - # was NOT quantized. Always include 'lm_head' if it appears in ignore set. + # was NOT quantized. 
ignore = list(dict.fromkeys(ignored_layers)) quant_set = set(quantized_layers) ignore = [n for n in ignore if n not in quant_set] - if "lm_head" not in ignore: - ignore.append("lm_head") qconfig = initialize_quantization(scheme=scheme_name, ignore=ignore) qconfig = qconfig.to_dict() From 82e0962a32dff3124d1d8fa688de685a1669b4f2 Mon Sep 17 00:00:00 2001 From: Xin He Date: Fri, 29 May 2026 13:42:55 +0800 Subject: [PATCH 4/5] update per comments Signed-off-by: Xin He --- auto_round/compressors/model_free.py | 19 ++++++++++++++++-- test/test_cpu/quantization/test_model_free.py | 20 ++++++++++++++++++- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/auto_round/compressors/model_free.py b/auto_round/compressors/model_free.py index 4cc27d1f0..74397f1e0 100644 --- a/auto_round/compressors/model_free.py +++ b/auto_round/compressors/model_free.py @@ -813,9 +813,14 @@ def _process_shard( # Preserve original tensors for ignored/skipped layers so that already- # quantized weights (FP8, FP4-packed, etc.) are NOT dequantized. + # Check both ".weight" and ".weight_packed" so that layers whose primary + # tensor uses non-standard naming (e.g. already-quantized FP4-packed layers + # stored as ".weight_packed") are correctly captured. preserved_prefixes: set[str] = set() for tname in raw_tensors: - if tname.endswith(".weight") and (matcher.should_ignore(tname) or matcher.should_skip(tname)): + if (tname.endswith(".weight") or tname.endswith(".weight_packed") or tname.endswith(".qweight")) and ( + matcher.should_ignore(tname) or matcher.should_skip(tname) + ): preserved_prefixes.add(tname.rsplit(".", 1)[0]) preserved_tensors: dict[str, torch.Tensor] = {} @@ -1082,6 +1087,16 @@ def _validate_supported_scheme( # Activation quantization for MXFP is dynamic at inference time, so the # weight-only RTN path here is independent of act_bits. if is_mx_fp(data_type): + # Restrict to the two explicitly supported MXFP presets when a string + # name is provided. 
Variants such as MXFP4_RCEIL / MXFP8_RCEIL use a + # different activation format; silently mapping them to "MXFP4" / + # "MXFP8" in the output config would misrepresent the requested scheme. + if isinstance(scheme_input, str) and scheme_input not in ("MXFP4", "MXFP8"): + raise ValueError( + f"Model-free mode only supports MXFP preset names 'MXFP4' and 'MXFP8', " + f"but got '{scheme_input}'. " + f"Supported preset schemes: {list(SUPPORTED_PRESET_SCHEMES)}." + ) if bits is None or bits not in _SUPPORTED_MXFP_BITS: raise ValueError( f"Model-free mode supports MXFP bits in {_SUPPORTED_MXFP_BITS}, " @@ -1130,7 +1145,7 @@ def is_model_free_supported_scheme( """ try: scheme_obj = _apply_scheme_overrides(scheme, scheme_overrides) - _validate_supported_scheme(scheme_obj, scheme_obj) + _validate_supported_scheme(scheme_obj, scheme) return True except (ValueError, TypeError): return False diff --git a/test/test_cpu/quantization/test_model_free.py b/test/test_cpu/quantization/test_model_free.py index dc0e6f59e..ea3cbf21d 100644 --- a/test/test_cpu/quantization/test_model_free.py +++ b/test/test_cpu/quantization/test_model_free.py @@ -34,6 +34,8 @@ ) from auto_round.schemes import QuantizationScheme +from ...envs import require_compressed_tensors + # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- @@ -349,6 +351,7 @@ def test_quantize_weight_mxfp8_shapes(self): assert out["layer.weight_scale"].shape == (64, 4) assert out["layer.weight_scale"].dtype == torch.uint8 + @require_compressed_tensors @pytest.mark.parametrize("scheme,fmt", [("MXFP4", "mxfp4-pack-quantized"), ("MXFP8", "mxfp8-quantized")]) def test_e2e_mxfp(self, tmp_path, scheme, fmt): tensors = { @@ -374,6 +377,7 @@ def test_e2e_mxfp(self, tmp_path, scheme, fmt): assert "lm_head.weight" in keys assert "lm_head.weight_packed" not in keys + @require_compressed_tensors def test_mxfp4_via_autoround_api(self, 
tmp_path): tensors = {"model.layers.0.fc1.weight": torch.randn(128, 128)} model_dir = _make_model_dir(tmp_path, _LLAMA_CFG, tensors) @@ -382,6 +386,7 @@ def test_mxfp4_via_autoround_api(self, tmp_path): qc = _read_qconfig(output_dir) assert qc["format"] == "mxfp4-pack-quantized" + @require_compressed_tensors def test_process_shard_mxfp(self, tmp_path): shard_path = str(tmp_path / "shard.safetensors") save_file({"layer.fc1.weight": torch.randn(64, 128)}, shard_path) @@ -398,12 +403,25 @@ def test_process_shard_mxfp(self, tmp_path): _SUPPORTED = ["W2A16", "W2A16G32", "W2A16G64", "W4A16", "W4A16_MIXED", "W8A16", "MXFP4", "MXFP8"] -_UNSUPPORTED = ["W3A16", "FPW8A16", "BF16", "MXINT4", "NVFP4", "FP8_BLOCK", "FP8_STATIC", "INT8_W8A8"] +_UNSUPPORTED = [ + "W3A16", + "FPW8A16", + "BF16", + "MXINT4", + "NVFP4", + "FP8_BLOCK", + "FP8_STATIC", + "INT8_W8A8", + "MXFP4_RCEIL", + "MXFP8_RCEIL", +] class TestSchemeValidation: @pytest.mark.parametrize("name", _SUPPORTED) def test_supported(self, tmp_path, name): + if name.startswith("MXFP"): + pytest.importorskip("compressed_tensors", reason="test requires compressed-tensors") model_dir = _make_model_dir(tmp_path, _LLAMA_CFG, {"model.layers.0.mlp.fc1.weight": torch.randn(64, 128)}) out = str(tmp_path / f"out_{name}") AutoRound(model=model_dir, scheme=name, model_free=True).quantize_and_save(out) From 87a12cb996cb8caa1480e6f1e5a4c42310bc8301 Mon Sep 17 00:00:00 2001 From: Xin He Date: Fri, 29 May 2026 16:03:07 +0800 Subject: [PATCH 5/5] fix segmentation failure Signed-off-by: Xin He --- .../export/test_auto_round_format.py | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/test/test_cuda/export/test_auto_round_format.py b/test/test_cuda/export/test_auto_round_format.py index d2b2f6814..4bd85f250 100644 --- a/test/test_cuda/export/test_auto_round_format.py +++ b/test/test_cuda/export/test_auto_round_format.py @@ -39,7 +39,7 @@ def _save_dir(self, tmp_path): shutil.rmtree(self.save_dir, 
ignore_errors=True) @require_greater_than_050 - @pytest.mark.parametrize("bits", [2, 3, 4, 8]) + @pytest.mark.parametrize("bits", [2, 4, 8]) @pytest.mark.parametrize("group_size", [32, 128]) @pytest.mark.parametrize("is_sym", [True, False]) def test_autoround_format(self, tiny_opt_model_path, bits, group_size, is_sym): @@ -59,6 +59,28 @@ def test_autoround_format(self, tiny_opt_model_path, bits, group_size, is_sym): model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cuda:0", trust_remote_code=True) assert isinstance(model, torch.nn.Module), "Loaded model is not an instance of torch.nn.Module" + # The 3-bit case is split out from the [2, 4, 8]-bit test to avoid a segmentation fault + @require_greater_than_050 + @pytest.mark.parametrize("bits", [3]) + @pytest.mark.parametrize("group_size", [32, 128]) + @pytest.mark.parametrize("is_sym", [True, False]) + def test_autoround_format_3bit(self, tiny_opt_model_path, bits, group_size, is_sym): + autoround = AutoRound( + tiny_opt_model_path, + bits=bits, + group_size=group_size, + sym=is_sym, + iters=0, + disable_opt_rtn=True, + ) + quantized_model_path = self.save_dir + + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + + # Verify loading + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cuda:0", trust_remote_code=True) + assert isinstance(model, torch.nn.Module), "Loaded model is not an instance of torch.nn.Module" + @pytest.mark.skip_ci(reason="Time-consuming; Accuracy evaluation") @require_autogptq def test_mixed_precision(self):