From 1c74b8b59ee539e191bc0bab845d1eba754672e3 Mon Sep 17 00:00:00 2001
From: realAsma <86726418+realAsma@users.noreply.github.com>
Date: Tue, 25 Nov 2025 21:23:40 -0800
Subject: [PATCH 1/8] [1/N] Refactored AutoQuantizeSearcher to
 _AutoQuantizeBaseSearcher & AutoQuantizeGradientSearcher; seperated quant
 modules and score modules (#586)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## What does this PR do?

**Type of change:**  Refator; Minor new feature

**Overview:** ?

1. Refactored AutoQuantizeSearcher to _AutoQuantizeBaseSearcher &
AutoQuantizeGradientSearcher - Prepares architecture for additional
search methods.
2. seperated quant modules and score modules - separate quantization
modules from scoring modules, enabling auto-quantization to measure
sensitivity at parent layers (e.g., MLP output for MoE experts) rather
than individual ops.
3. Also see https://github.com/NVIDIA/TensorRT-Model-Optimizer/pull/592
and https://github.com/NVIDIA/TensorRT-Model-Optimizer/pull/588

## Testing
See unittests; `tests/unit/torch/quantization/test_autoquant.py` and
`tests/unit/torch/quantization/plugins/test_huggingface.py`

## Before your PR is "*Ready for review*"
<!-- If you haven't finished some of the above items you can still open
`Draft` PR. -->

- **Make sure you read and follow [Contributor
guidelines](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CONTRIBUTING.md)**
and your commits are signed.
- **Is this change backward compatible?**: Yes
- **Did you write any new necessary tests?**: Yes
- **Did you add or update any necessary documentation?**: Yes
- **Did you update
[Changelog](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CHANGELOG.rst)?**:
Not Required

## Additional Information
<!-- E.g. related issue. -->

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

* **New Features**
  * Added support for score modules in quantization workflows.
  * Added optional naming for quantization recipes.

* **Bug Fixes**
* Improved quantization grouping rules documentation with clearer
configuration examples.

* **Refactor**
  * Renamed quantization module parameters for improved clarity.
  * Enhanced quantization search architecture for better scalability.

<sub>✏️ Tip: You can customize this high-level summary in your review
settings.</sub>

<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: realAsma <akuriparambi@nvidia.com>
Co-authored-by: Asma Kuriparambil Thekkumpate <akuriparambi@akuriparambi-mlt.client.nvidia.com>
Signed-off-by: inisis <desmond.yao@buaa.edu.cn>
---
 CHANGELOG.rst                                 |    2 +
 examples/llm_eval/gen_model_answer.py         |   35 +-
 examples/llm_eval/lm_eval_hf.py               |   39 +-
 examples/llm_eval/mmlu.py                     |    6 +
 examples/llm_eval/quantization_utils.py       |   65 +-
 examples/llm_ptq/hf_ptq.py                    |   73 +-
 .../llm_ptq/scripts/huggingface_example.sh    |   22 +
 examples/llm_ptq/scripts/parser.sh            |    8 +-
 modelopt/torch/opt/hparam.py                  |   10 +-
 modelopt/torch/quantization/algorithms.py     | 1075 ++++++++++++-----
 modelopt/torch/quantization/model_quant.py    |   78 +-
 .../torch/quantization/plugins/huggingface.py |    4 +-
 ...rt_weight.py => test_export_weight_gpu.py} |    0
 .../quantization/plugins/test_huggingface.py  |   28 +-
 .../unit/torch/quantization/test_autoquant.py |   66 +-
 15 files changed, 1170 insertions(+), 341 deletions(-)
 rename tests/gpu/torch/export/{test_export_weight.py => test_export_weight_gpu.py} (100%)
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 899b14009..beb01abf0 100755
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -14,6 +14,8 @@ Model Optimizer Changelog (Linux)
 - Add MoE (e.g. Qwen3-30B-A3B, gpt-oss-20b) pruning support for ``num_moe_experts``, ``moe_ffn_hidden_size`` and ``moe_shared_expert_intermediate_size`` parameters in Minitron pruning (``mcore_minitron``).
 - Add ``specdec_bench`` example to benchmark speculative decoding performance. See `examples/specdec_bench/README.md <https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/specdec_bench#speculative-decoding-benchmark>`_ for more details.
 - Add FP8/NVFP4 KV cache quantization support for Megatron Core models.
+- Add KL Divergence loss based auto_quantize method. See `auto_quantize API docs <https://nvidia.github.io/TensorRT-Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize>`_ for more details.
+- Add support for saving and resuming auto_quantize search state. This speeds up the auto_quantize process by skipping the score estimation step if the search state is provided.
 - Add flag ``trt_plugins_precision`` in ONNX autocast to indicate custom ops precision. This is similar to the flag already existing in the quantization workflow.
 - Add support for PyTorch Geometric quantization.
 - Add per tensor and per channel MSE calibrator support.
diff --git a/examples/llm_eval/gen_model_answer.py b/examples/llm_eval/gen_model_answer.py
index 86504db62..42a7eaac9 100644
--- a/examples/llm_eval/gen_model_answer.py
+++ b/examples/llm_eval/gen_model_answer.py
@@ -201,8 +201,11 @@ def get_model_answers(
                 tokenizer,
                 args.calib_batch_size,
                 args.calib_size,
-                args.auto_quantize_bits,
                 test_generated=False,
+                auto_quantize_bits=args.auto_quantize_bits,
+                auto_quantize_method=args.auto_quantize_method,
+                auto_quantize_score_size=args.auto_quantize_score_size,
+                auto_quantize_checkpoint=args.auto_quantize_checkpoint,
             )
 
     for question in tqdm(questions):
@@ -450,6 +453,36 @@ def reorg_answer_file(answer_file):
             "regular quantization without auto_quantize search will be applied."
         ),
     )
+    parser.add_argument(
+        "--auto_quantize_method",
+        type=str,
+        default="gradient",
+        choices=["gradient", "kl_div"],
+        help=(
+            "Method for auto_quantize sensitivity analysis. 'gradient' uses gradient-based method "
+            "(requires labels in dataset). 'kl_div' uses KL divergence between original and "
+            "quantized model outputs (no labels required). Default: 'gradient'"
+        ),
+    )
+    parser.add_argument(
+        "--auto_quantize_score_size",
+        type=int,
+        default=128,
+        help=(
+            "Number of samples to use for auto_quantize scoring. Most of auto_quantize time is spent on "
+            "sensitivity score estimation, so reducing this speeds it up while only minimally affecting "
+            "final model accuracy compared to lowering --calib_size (the number of samples used for calibration)."
+        ),
+    )
+    parser.add_argument(
+        "--auto_quantize_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Path to checkpoint file for saving/restoring auto_quantize search state "
+            "(sensitivity scores, costs, etc.). Only used when auto_quantize_bits is specified."
+        ),
+    )
     parser.add_argument(
         "--trust_remote_code",
         help="Set trust_remote_code for Huggingface models and tokenizers",
diff --git a/examples/llm_eval/lm_eval_hf.py b/examples/llm_eval/lm_eval_hf.py
index e980a376e..31103ff86 100755
--- a/examples/llm_eval/lm_eval_hf.py
+++ b/examples/llm_eval/lm_eval_hf.py
@@ -53,6 +53,9 @@ def create_from_arg_obj(cls: type[T], arg_dict: dict, additional_config: dict |
 
     quant_cfg = arg_dict.pop("quant_cfg", None)
     auto_quantize_bits = arg_dict.pop("auto_quantize_bits", None)
+    auto_quantize_method = arg_dict.pop("auto_quantize_method", "gradient")
+    auto_quantize_score_size = arg_dict.pop("auto_quantize_score_size", 128)
+    auto_quantize_checkpoint = arg_dict.pop("auto_quantize_checkpoint", None)
     calib_batch_size = arg_dict.pop("calib_batch_size", None)
     calib_size = arg_dict.pop("calib_size", 512)
     compress = arg_dict.pop("compress", False)
@@ -81,8 +84,11 @@ def create_from_arg_obj(cls: type[T], arg_dict: dict, additional_config: dict |
             batch_size=calib_batch_size,
             calib_size=calib_size,
             auto_quantize_bits=auto_quantize_bits,
+            auto_quantize_method=auto_quantize_method,
+            auto_quantize_score_size=auto_quantize_score_size,
             test_generated=False,
             compress=compress,
+            auto_quantize_checkpoint=auto_quantize_checkpoint,
         )
 
     return model_obj
@@ -101,6 +107,12 @@ def setup_parser_with_modelopt_args():
             "comma-separated list of quantization quantization formats that will be searched by `auto_quantize`"
         ),
     )
+    parser.add_argument(
+        "--calib_batch_size", type=int, help="Batch size for quantization calibration"
+    )
+    parser.add_argument(
+        "--calib_size", type=int, help="Calibration size for quantization", default=512
+    )
     parser.add_argument(
         "--auto_quantize_bits",
         type=float,
@@ -110,10 +122,30 @@ def setup_parser_with_modelopt_args():
         ),
     )
     parser.add_argument(
-        "--calib_batch_size", type=int, help="Batch size for quantization calibration"
+        "--auto_quantize_method",
+        type=str,
+        default="gradient",
+        choices=["gradient", "kl_div"],
+        help=(
+            "Method for auto_quantize sensitivity analysis. 'gradient' uses gradient-based method "
+            "(requires labels in dataset). 'kl_div' uses KL divergence between original and "
+            "quantized model outputs (no labels required). Default: 'gradient'"
+        ),
     )
     parser.add_argument(
-        "--calib_size", type=int, help="Calibration size for quantization", default=512
+        "--auto_quantize_score_size",
+        type=int,
+        default=128,
+        help=(
+            "Number of samples to use for auto_quantize scoring. Most of auto_quantize time is spent on "
+            "sensitivity score estimation, so reducing this speeds it up while only minimally affecting "
+            "final model accuracy compared to lowering --calib_size (the number of samples used for calibration)."
+        ),
+    )
+    parser.add_argument(
+        "--auto_quantize_checkpoint",
+        type=str,
+        help=("Path to checkpoint file for saving/restoring auto_quantize search state. "),
     )
     parser.add_argument(
         "--compress",
@@ -139,6 +171,9 @@ def setup_parser_with_modelopt_args():
         {
             "quant_cfg": args.quant_cfg,
             "auto_quantize_bits": args.auto_quantize_bits,
+            "auto_quantize_method": args.auto_quantize_method,
+            "auto_quantize_score_size": args.auto_quantize_score_size,
+            "auto_quantize_checkpoint": args.auto_quantize_checkpoint,
             "calib_batch_size": args.calib_batch_size,
             "calib_size": args.calib_size,
             "compress": args.compress,
diff --git a/examples/llm_eval/mmlu.py b/examples/llm_eval/mmlu.py
index 6a2f70ce4..ca244052b 100755
--- a/examples/llm_eval/mmlu.py
+++ b/examples/llm_eval/mmlu.py
@@ -227,6 +227,9 @@ def main(
     batch_size: int = 0,
     calib_size: int = 512,
     dtype: str = "bfloat16",
+    auto_quantize_method: str = "gradient",
+    auto_quantize_score_size: int = 128,
+    auto_quantize_checkpoint: str | None = None,
     **kwargs,
 ):
     random.seed(RAND_SEED)
@@ -281,6 +284,9 @@ def main(
                     batch_size=batch_size,
                     calib_size=calib_size,
                     auto_quantize_bits=auto_quantize_bits,
+                    auto_quantize_method=auto_quantize_method,
+                    auto_quantize_score_size=auto_quantize_score_size,
+                    auto_quantize_checkpoint=auto_quantize_checkpoint,
                 )
 
     for subject in tqdm(subjects):
diff --git a/examples/llm_eval/quantization_utils.py b/examples/llm_eval/quantization_utils.py
index 2f43c93e0..3df44115a 100644
--- a/examples/llm_eval/quantization_utils.py
+++ b/examples/llm_eval/quantization_utils.py
@@ -66,8 +66,11 @@ def _quantize_model_with_dataset(
     quant_cfg: str | list[str],
     calib_dataset,
     auto_quantize_bits=None,
+    auto_quantize_method="gradient",
+    auto_quantize_score_size=128,
     batch_size=1,
     compress=False,
+    auto_quantize_checkpoint=None,
 ):
     if hasattr(lm, "gpt2"):
         net = lm.gpt2
@@ -81,23 +84,42 @@ def _quantize_model_with_dataset(
             getattr(mtq, quant_fmt) for quant_fmt in quant_cfg if quant_fmt != "NONE"
         ]
 
-        def loss_func(output, data):
-            # For transformers AutoModelForCausalLM models, the outputs are wrapped in `CausalLMOutputWithPast`
-            # which contains the loss attribute.
-            return output.loss
+        # Configure forward_step and loss_func based on method
+        if auto_quantize_method == "gradient":
+            # For gradient-based method, return full output with loss
+            def forward_step(model, batch):
+                return model(**batch)
+
+            def loss_func(output, data):
+                # For transformers AutoModelForCausalLM models, the outputs are wrapped in `CausalLMOutputWithPast`
+                # which contains the loss attribute.
+                return output.loss
+        elif auto_quantize_method == "kl_div":
+            # For KL divergence method, return only logits
+            def forward_step(model, batch):
+                return model(**batch).logits
+
+            loss_func = None  # KL divergence doesn't need a custom loss function
+        else:
+            raise ValueError(
+                f"Invalid auto_quantize_method: {auto_quantize_method}. "
+                "Must be 'gradient' or 'kl_div'"
+            )
 
         net, _ = mtq.auto_quantize(
             net,
             constraints={"effective_bits": auto_quantize_bits},
             quantization_formats=quant_cfg_for_search,
             data_loader=calib_dataset,
-            forward_step=lambda model, batch: model(**batch),
+            forward_step=forward_step,
             loss_func=loss_func,
             num_calib_steps=len(calib_dataset),
-            num_score_steps=min(
-                len(calib_dataset), 128 // batch_size
-            ),  # Limit the number of score steps to avoid long calibration time
+            # Most time is spent on score estimation; fewer samples speed it up with little accuracy impact.
+            num_score_steps=min(len(calib_dataset), max(auto_quantize_score_size // batch_size, 1)),
             verbose=True,
+            method=auto_quantize_method,
+            # disabled_layers=["*lm_head*", "*mlp.gate.*"],
+            checkpoint=auto_quantize_checkpoint,
         )
     else:
         mtq_cfg = CUSTOM_CONFIG.get(quant_cfg)  # type: ignore [arg-type]
@@ -141,10 +163,13 @@ def quantize_model(
     tokenizer,
     batch_size,
     calib_size,
-    auto_quantize_bits=None,
     data="cnn_dailymail",
     test_generated=True,
     compress=False,
+    auto_quantize_bits=None,
+    auto_quantize_method="gradient",
+    auto_quantize_score_size=128,
+    auto_quantize_checkpoint=None,
 ):
     """Quantizes the model with the provided calibration dataset.
 
@@ -155,10 +180,14 @@ def quantize_model(
         tokenizer: the tokenizer.
         batch_size: the calibration batch size for each calibration inference run.
         calib_size: the total calibration dataset size.
-        auto_quantize_bits: The effective bits constraint for auto_quantize.
         data: the name of the calibration dataset.
         test_generated:  If ``True``, test the generated text before and after quantization.
         compress: If ``True``, compress the model after quantization.
+        auto_quantize_bits: The effective bits constraint for auto_quantize.
+        auto_quantize_method: The method for auto_quantize ('gradient' or 'kl_div').
+        auto_quantize_score_size: Number of samples used for auto_quantize scoring.
+        auto_quantize_checkpoint: Path to checkpoint file for saving/restoring auto_quantize search state
+            (sensitivity scores, costs, etc.). Only used when auto_quantize_bits is specified.
     """
     if "AWQ" in quant_cfg:
         print(
@@ -170,8 +199,10 @@ def quantize_model(
     if hasattr(model, "model"):
         device = model.model.device
 
+    is_gradient_based = auto_quantize_bits is not None and auto_quantize_method == "gradient"
+
     if batch_size == 0:
-        if auto_quantize_bits is not None or torch.distributed.is_initialized():
+        if is_gradient_based or torch.distributed.is_initialized():
             raise ValueError("We dont support automatic batch size inference for this case.")
 
         net = model.gpt2 if hasattr(model, "gpt2") else model.model
@@ -186,7 +217,7 @@ def quantize_model(
         batch_size=batch_size,
         num_samples=calib_size,
         device=device,
-        include_labels=auto_quantize_bits is not None,
+        include_labels=is_gradient_based,
     )
 
     if test_generated:
@@ -194,7 +225,15 @@ def quantize_model(
         generated_str_before_ptq = model.run(input_str)
 
     _quantize_model_with_dataset(
-        model, quant_cfg, calib_dataloader, auto_quantize_bits, batch_size, compress
+        model,
+        quant_cfg,
+        calib_dataloader,
+        auto_quantize_bits,
+        auto_quantize_method,
+        auto_quantize_score_size,
+        batch_size,
+        compress,
+        auto_quantize_checkpoint,
     )
 
     if test_generated:
diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index 7bb8d0f28..f0bb56caa 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -95,7 +95,15 @@
 
 
 def auto_quantize(
-    model, qformat, auto_quantize_bits, calib_dataloader, calibrate_loop, batch_size=1
+    model,
+    qformat,
+    calib_dataloader,
+    calibrate_loop,
+    auto_quantize_bits,
+    batch_size=1,
+    auto_quantize_method="gradient",
+    auto_quantize_score_size=128,
+    auto_quantize_checkpoint=None,
 ):
     qformat_list = qformat.split(",")
     assert qformat_list, "No quantization formats provided"
@@ -122,18 +130,34 @@ def loss_func(output, data):
         # which contains the loss attribute.
         return output.loss
 
+    if auto_quantize_method == "gradient":
+        # For gradient-based method, return full output with loss
+        def forward_step(model, batch):
+            return model(**batch)
+    elif auto_quantize_method == "kl_div":
+        # For KL divergence method, return only logits
+        def forward_step(model, batch):
+            return model(**batch).logits
+    else:
+        raise ValueError(
+            f"Invalid auto_quantize_method: {auto_quantize_method}. Must be 'gradient' or 'kl_div'"
+        )
+
     model, _ = mtq.auto_quantize(
         model,
         constraints={"effective_bits": auto_quantize_bits},
         data_loader=calib_dataloader,
-        forward_step=lambda model, batch: model(**batch),
-        loss_func=loss_func,
+        forward_step=forward_step,
+        loss_func=loss_func,  # Only used for gradient-based method
         # TRTLLM only support one quantization format or None (do not quantize, internally supported)
         quantization_formats=[QUANT_CFG_CHOICES[format] for format in qformat_list],
         num_calib_steps=len(calib_dataloader),
-        num_score_steps=len(calib_dataloader),
+        # AutoQuantize scoring is the costly phase; allow smaller sample counts than calibration.
+        num_score_steps=min(len(calib_dataloader), max(auto_quantize_score_size // batch_size, 1)),
         verbose=True,
         disabled_layers=["*lm_head*"],
+        method=auto_quantize_method,
+        checkpoint=auto_quantize_checkpoint,
     )
 
     # We need to explicitly calibrate for kv cache quantization
@@ -191,10 +215,13 @@ def quantize_model(model, quant_cfg, args, calib_dataloader=None, calibration_on
         model = auto_quantize(
             model,
             args.qformat,
-            args.auto_quantize_bits,
             calib_dataloader,
             calibrate_loop,
+            args.auto_quantize_bits,
             args.batch_size,
+            args.auto_quantize_method,
+            args.auto_quantize_score_size,
+            args.auto_quantize_checkpoint,
         )
     elif calibration_only:
         model = mtq.calibrate(model, quant_cfg["algorithm"], forward_loop=calibrate_loop)
@@ -444,13 +471,17 @@ def main(args):
             assert tokenizer is not None and isinstance(
                 tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)
             ), "The PreTrainedTokenizer must be set"
+            # Labels are only needed for gradient-based auto_quantize
+            include_labels = (
+                args.auto_quantize_bits is not None and args.auto_quantize_method == "gradient"
+            )
             calib_dataloader = get_dataset_dataloader(
                 dataset_name=args.dataset,
                 tokenizer=tokenizer,
                 batch_size=args.batch_size,
                 num_samples=args.calib_size,
                 device=device,
-                include_labels=args.auto_quantize_bits is not None,
+                include_labels=include_labels,
             )
 
         quant_cfg = build_quant_cfg(
@@ -803,6 +834,36 @@ def output_decode(generated_ids, input_shape):
         default=None,
         type=str,
     )
+    parser.add_argument(
+        "--auto_quantize_method",
+        type=str,
+        default="gradient",
+        choices=["gradient", "kl_div"],
+        help=(
+            "Method for auto_quantize sensitivity analysis. 'gradient' uses gradient-based method "
+            "(requires labels in dataset). 'kl_div' uses KL divergence between original and "
+            "quantized model outputs (no labels required). Default: 'gradient'"
+        ),
+    )
+    parser.add_argument(
+        "--auto_quantize_score_size",
+        type=int,
+        default=128,
+        help=(
+            "Number of samples to use for auto_quantize scoring. Most of auto_quantize time is spent on "
+            "sensitivity score estimation, so reducing this speeds it up while only minimally affecting "
+            "final model accuracy compared to lowering --calib_size (the number of samples used for calibration)."
+        ),
+    )
+    parser.add_argument(
+        "--auto_quantize_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Path to checkpoint file for saving/restoring auto_quantize search state "
+            "(sensitivity scores, costs, etc.). Only used when auto_quantize_bits is specified."
+        ),
+    )
 
     args = parser.parse_args()
 
diff --git a/examples/llm_ptq/scripts/huggingface_example.sh b/examples/llm_ptq/scripts/huggingface_example.sh
index 7b7d6910e..043b690e5 100755
--- a/examples/llm_ptq/scripts/huggingface_example.sh
+++ b/examples/llm_ptq/scripts/huggingface_example.sh
@@ -93,6 +93,28 @@ fi
 if [ -n "$AUTO_QUANTIZE_BITS" ]; then
     PTQ_ARGS+=" --auto_quantize_bits=$AUTO_QUANTIZE_BITS "
 fi
+
+if [ -n "$AUTO_QUANTIZE_METHOD" ]; then
+    PTQ_ARGS+=" --auto_quantize_method=$AUTO_QUANTIZE_METHOD "
+fi
+
+if [ -n "$AUTO_QUANTIZE_SCORE_SIZE" ]; then
+    PTQ_ARGS+=" --auto_quantize_score_size=$AUTO_QUANTIZE_SCORE_SIZE "
+fi
+
+# Automatically generate auto_quantize checkpoint path if not provided
+if [ -n "$AUTO_QUANTIZE_BITS" ] && [ -z "$AUTO_QUANTIZE_CHECKPOINT" ]; then
+    # Create a descriptive checkpoint name based on model and quantization settings
+    AQ_METHOD=${AUTO_QUANTIZE_METHOD:-gradient}
+    AUTO_QUANTIZE_CHECKPOINT="${ROOT_SAVE_PATH}/auto_quantize_checkpoints/${MODEL_NAME}_${AQ_METHOD}.pth"
+    mkdir -p $(dirname $AUTO_QUANTIZE_CHECKPOINT)
+    echo "Auto-generated auto_quantize checkpoint path: $AUTO_QUANTIZE_CHECKPOINT"
+fi
+
+if [ -n "$AUTO_QUANTIZE_BITS" ]; then
+    PTQ_ARGS+=" --auto_quantize_checkpoint=$AUTO_QUANTIZE_CHECKPOINT "
+fi
+
 if [ -n "$CALIB_DATASET" ]; then
     PTQ_ARGS+=" --dataset=$CALIB_DATASET "
 fi
diff --git a/examples/llm_ptq/scripts/parser.sh b/examples/llm_ptq/scripts/parser.sh
index 7df601327..8db2fe131 100644
--- a/examples/llm_ptq/scripts/parser.sh
+++ b/examples/llm_ptq/scripts/parser.sh
@@ -36,7 +36,7 @@ parse_options() {
     USE_SEQ_DEVICE_MAP=false
 
   # Parse command-line options
-  ARGS=$(getopt -o "" -l "model:,quant:,kv_cache_quant:,tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:,calib_seq:" -n "$0" -- "$@")
+  ARGS=$(getopt -o "" -l "model:,quant:,kv_cache_quant:,tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:,calib_seq:,auto_quantize_method:,auto_quantize_score_size:,auto_quantize_checkpoint:" -n "$0" -- "$@")
 
   eval set -- "$ARGS"
   while true; do
@@ -65,6 +65,9 @@ parse_options() {
       --low_memory_mode ) LOW_MEMORY_MODE=true; shift;;
       --calib_dataset ) CALIB_DATASET="$2"; shift 2;;
       --calib_seq ) CALIB_SEQ="$2"; shift 2;;
+      --auto_quantize_method ) AUTO_QUANTIZE_METHOD="$2"; shift 2;;
+      --auto_quantize_score_size ) AUTO_QUANTIZE_SCORE_SIZE="$2"; shift 2;;
+      --auto_quantize_checkpoint ) AUTO_QUANTIZE_CHECKPOINT="$2"; shift 2;;
       -- ) shift; break ;;
       * ) break ;;
     esac
@@ -150,5 +153,8 @@ parse_options() {
   echo "low_memory_mode: $LOW_MEMORY_MODE"
   echo "calib_dataset: $CALIB_DATASET"
   echo "calib_seq: $CALIB_SEQ"
+  echo "auto_quantize_method: $AUTO_QUANTIZE_METHOD"
+  echo "auto_quantize_score_size: $AUTO_QUANTIZE_SCORE_SIZE"
+  echo "auto_quantize_checkpoint: $AUTO_QUANTIZE_CHECKPOINT"
   echo "================="
 }
diff --git a/modelopt/torch/opt/hparam.py b/modelopt/torch/opt/hparam.py
index 13a9aef0d..60bde4c1e 100644
--- a/modelopt/torch/opt/hparam.py
+++ b/modelopt/torch/opt/hparam.py
@@ -48,7 +48,7 @@ def __eq__(self, other) -> bool:
 class Hparam:
     """A base hyperparameter of a DynamicModule.
 
-    An example of such a Hparam could be an hparam with identity dependencies.
+    Keeps track of hyperparameter values and their importance, which can be used for search algorithms.
     """
 
     Importance = Union[torch.Tensor, None]  # noqa: UP007
@@ -249,10 +249,14 @@ def _enforce_order(self, order: torch.Tensor | None = None) -> None:
             order = order.cpu()
         self._slice_order = order
 
+    @property
+    def attrs(self) -> list[str]:
+        """Return the attributes of the hparam for repr."""
+        return ["choices", "active", "original"]
+
     def __repr__(self) -> str:
         """Return string representation with relevant properties of the class."""
-        attrs = ["choices", "active", "original"]
-        return f"{type(self).__name__}({', '.join(f'{x}={getattr(self, x)}' for x in attrs)})"
+        return f"{type(self).__name__}({', '.join(f'{x}={getattr(self, x)}' for x in self.attrs)})"
 
     def __iand__(self, hp: "Hparam"):
         """Merge another hparam into self."""
diff --git a/modelopt/torch/quantization/algorithms.py b/modelopt/torch/quantization/algorithms.py
index 3b99e40a9..16abc6ef4 100644
--- a/modelopt/torch/quantization/algorithms.py
+++ b/modelopt/torch/quantization/algorithms.py
@@ -19,6 +19,7 @@
 import gc
 import types
 import warnings
+from abc import ABC, abstractmethod
 from collections import defaultdict
 from collections.abc import Callable, Sequence
 from contextlib import nullcontext
@@ -34,7 +35,7 @@
 from modelopt.torch.opt.searcher import LPS, BaseSearcher, SearchConfig, SearchStateDict
 from modelopt.torch.opt.utils import get_hparam, named_hparams
 from modelopt.torch.utils import create_param_grad_clear_hook, print_rank_0, report_memory
-from modelopt.torch.utils.distributed import DistributedProcessGroup, is_master
+from modelopt.torch.utils.distributed import DistributedProcessGroup, ParallelState, is_master
 
 from . import config as mtq_config
 from . import model_calib
@@ -169,20 +170,29 @@ def fold_pqs_to_weights(model):
 class QuantRecipeHparam(Hparam):
     """An Hparam for quantization recipes.
 
-    In addition, this Hparam also:
-    1. Keeps a link to its modules and sets the quantizers for the module based on the active recipe.
-    2. Keeps track of the importance of each recipe in a dict instead of a tensor
+    See :class:`Hparam <modelopt.torch.opt.hparam.Hparam>` for more details. In addition, this Hparam also:
+
+    * Keeps a link to its ``quant_modules`` and ``score_modules`` and sets the quantizers for the
+      ``quant_modules`` based on the active recipe.
+    * Provides ``get_score()`` and ``get_cost()`` methods to evaluate recipes.
+    * Registers itself with each ``score_module`` via the ``_hparams_for_scoring`` attribute.
     """
 
     def __init__(
         self,
         choices: Sequence[QuantRecipe] | None = None,
-        nn_modules: list[nn.Module] | None = None,
+        quant_modules: list[nn.Module] | None = None,
+        score_modules: list[nn.Module] | None = None,
+        name: str | None = None,
     ) -> None:
         """Initializes Hparam with original value and choices."""
         choices = sorted({*(choices if choices else []), QuantRecipe(quant_cfg=None)})
         super().__init__(choices, original=choices[0])
-        self.nn_modules = nn_modules if nn_modules else []
+
+        self.name = name
+
+        self.quant_modules = list(set(quant_modules or []))
+        self.score_modules = list(set(score_modules or self.quant_modules))
 
         # This is a hack; We dont want to make the input_quantizer, weight_quantizer, output_quantizer
         # a dynamic attribute for backward compatibility with the model_calib.py
@@ -191,17 +201,17 @@ def __init__(
 
         quant_recipe: QuantRecipe
         for quant_recipe in self.choices:
-            for nn_module in self.nn_modules:
+            for quant_module in self.quant_modules:
                 for quantizer_attr_name in [
                     "input_quantizer",
                     "weight_quantizer",
                     "output_quantizer",
                 ]:
-                    setattr(nn_module, quantizer_attr_name, TensorQuantizer())
+                    setattr(quant_module, quantizer_attr_name, TensorQuantizer())
 
-                set_quantizer_by_cfg(nn_module, quant_recipe.config.quant_cfg)
-                self._all_quantizer_choices[quant_recipe][nn_module] = {
-                    quantizer_attr_name: getattr(nn_module, quantizer_attr_name)
+                set_quantizer_by_cfg(quant_module, quant_recipe.config.quant_cfg)
+                self._all_quantizer_choices[quant_recipe][quant_module] = {
+                    quantizer_attr_name: getattr(quant_module, quantizer_attr_name)
                     for quantizer_attr_name in [
                         "input_quantizer",
                         "weight_quantizer",
@@ -211,14 +221,17 @@ def __init__(
 
         self.active = self.original
 
+        # Importance dict is keyed by score_module (where the score is computed)
         self._importance_dict = {
-            quant_recipe: {
-                mod: torch.zeros((), device=mod.weight.device, dtype=torch.float32)
-                for mod in self.nn_modules
-            }
-            for quant_recipe in self.choices
+            quant_recipe: dict.fromkeys(self.score_modules) for quant_recipe in self.choices
         }
 
+        # Attach this hparam to each score_module's set of hparams it scores
+        for score_module in self.score_modules:
+            if not hasattr(score_module, "_hparams_for_scoring"):
+                score_module._hparams_for_scoring = set()
+            score_module._hparams_for_scoring.add(self)
+
     @property
     def active(self) -> HPType:
         """Return the currently active value."""
@@ -240,53 +253,103 @@ def active(self, val: HPType | None):
 
     @property
     def importance(self) -> dict:
-        """Return the importance dict mapping recipe and importance."""
-        return {
-            quant_recipe: sum(v.cpu().item() for v in importance_dict.values())
-            for quant_recipe, importance_dict in self._importance_dict.items()
-        }
+        """Raises an error since this is not a useful abstraction for AutoQuantize."""
+        raise NotImplementedError
+
+    def get_score(self, recipe: QuantRecipe) -> float:
+        """Get the score for a given recipe."""
+        total_score = 0
+        for score_module in self.score_modules:
+            importance = self._importance_dict[recipe][score_module]
+            if importance is None:
+                continue
 
+            parallel_state = getattr(score_module, "parallel_state", None)
 
-def _add_auto_quantize_score(grad_output, output_diff, score_tensor):
-    score_tensor += ((grad_output.float() ** 2) * (output_diff.float() ** 2)).sum()
+            if parallel_state is None:
+                total_score += importance.cpu().item()
+                continue
 
+            if parallel_state.expert_model_parallel_group.is_initialized():
+                # TODO: Support expert model parallelism for score estimation
+                warnings.warn("AutoQuantize does not support expert model parallelism yet.")
+            importance = DistributedProcessGroup.get_dist_syncd_obj(
+                importance,
+                [parallel_state.tensor_parallel_group, parallel_state.data_parallel_group],
+                sum,
+            )
+            total_score += importance.cpu().item()
+        return total_score
 
-class AutoQuantizeSearcher(BaseSearcher):
-    """A searcher for AutoQuantize algorithm.
+    def get_cost(self, recipe: QuantRecipe) -> float:
+        """Get the cost for a given recipe.
 
-    In AutoQuantize, we search for the best per-layer quantization configuration that minimizes the sum of per-layer
-    scores while meeting the specified constraint. AutoQuantize uses Linear Programming Solver to find the
-    optimal quantization configuration.
+        The cost is the total weight size of the quantizable modules multiplied by
+        the compression ratio of the recipe.
+        """
+        cost = 0
+        for quant_module in self.quant_modules:
+            weight_size = _AutoQuantizeBaseSearcher._get_total_weight_size([quant_module])
+            parallel_state = getattr(quant_module, "parallel_state", None)
 
-    The auto_quantize score for a layer quantization configuration is an approximation of model loss change change due
-    to quantizing the particular layer with the particular configuration.
-    The approximation is based on taylor expansion of the loss function wrt to the quantized output of the layer and
-    substitution of Fisher information for Hessian.
-    This approximation is mathematically correct for models where the loss
-    is a log likelihood loss such as BERT, GPT, etc. However, the auto_quantize score can still be used as a proxy
-    for other models such as ResNet.
-    """
+            if parallel_state is None:
+                cost += weight_size * recipe.compression
+                continue
+
+            if parallel_state.expert_model_parallel_group.is_initialized():
+                # TODO: Support expert model parallelism
+                warnings.warn("AutoQuantize does not support expert model parallelism yet.")
+
+            weight_size = DistributedProcessGroup.get_dist_syncd_obj(
+                weight_size,
+                [parallel_state.tensor_parallel_group],
+                sum,
+            )
+
+            # Across data parallel groups, the weight size is the same for all the ranks.
+            weight_size = DistributedProcessGroup.get_dist_syncd_obj(
+                weight_size,
+                [parallel_state.data_parallel_group],
+                lambda a: a[0],
+            )
+            cost += weight_size * recipe.compression
+
+        return cost
+
+    @property
+    def attrs(self) -> list[str]:
+        """Return the attributes of the hparam for repr."""
+        return ["name", *super().attrs]
+
+
+class _AutoQuantizeBaseSearcher(BaseSearcher, ABC):
+    """Base searcher for AutoQuantize algorithm."""
+
+    # This searcher finds optimal per-layer quantization by searching across quantization formats
+    # for each quantizable module (quant module). Optionally, quant grouping rules can restrict
+    # certain modules to share the same format. Sensitivity scores are computed from perturbations
+    # at score modules. See AutoQuantizeGradientSearcher for detailed documentation.
 
     candidate_stats: dict[str, dict[str, list[float]]]
     best: dict[str, Any]
-    custom_support: list[tuple[Callable, Callable, Callable]] = []
 
-    rules = [
+    quant_grouping_rules = [
         r"^(.*?)\.(q_proj|k_proj|v_proj)$",  # q_proj, k_proj, v_proj for llama like models
+        # gate_proj, up_proj, down_proj for Qwen3 like MoE models
+        r"^(.*?\.mlp\.experts)\.\d+\.(gate_proj|up_proj|down_proj)$",
         r"^(.*?)\.(gate_proj|up_proj)$",  # gate_proj, up_proj for llama like models
         r"^(.*?)\.(\d+\.(w1|w2|w3))$",  # mixtral experts
         r"^(.*?)\.((w1_linear|w2_linear|w3_linear)\.\d+)$",  # dbrx experts
     ]
 
+    score_module_rules = []
+
     @property
     def default_search_config(self):
         """Get the default config for the searcher."""
         return {
             "quantization_formats": ["NVFP4_DEFAULT_CFG", "FP8_DEFAULT_CFG"],
             "data_loader": None,
-            "forward_step": None,
-            "loss_func": None,
-            "forward_backward_step": None,
             "num_calib_steps": 512,
             "num_score_steps": 128,
             "deployment": None,
@@ -301,15 +364,11 @@ def default_state_dict(self) -> SearchStateDict:
         return {
             "candidate_stats": defaultdict(dict),
             "best": {"recipe": {}, "constraints": {}, "score": float("inf"), "is_satisfied": False},
-            "constraints": {},
         }
 
     def sanitize_search_config(self, config: SearchConfig | None) -> SearchConfig:
         """Sanitize the search config dict."""
         config = config or {}
-        if "score_func" in config:
-            warnings.warn("`score_func` is ignored for `auto_quantize`.")
-            config.pop("score_func")
         config = super().sanitize_search_config(config)
         assert config["data_loader"] is not None, (
             "`data_loader` must be provided for `auto_quantize`."
@@ -317,13 +376,6 @@ def sanitize_search_config(self, config: SearchConfig | None) -> SearchConfig:
         assert config["forward_step"] is not None, (
             "`forward_step` must be provided for `auto_quantize`."
         )
-
-        if config["forward_backward_step"] is None:
-            assert config["loss_func"] is not None, (
-                "`loss_func` or `forward_backward_step` must be provided for `auto_quantize`."
-            )
-            config["forward_backward_step"] = self._get_default_forward_backward_step()
-
         return config
 
     @staticmethod
@@ -343,148 +395,72 @@ def _get_search_recipes(quantization_formats):
             }
         )
 
-    @classmethod
-    def register_custom_support(
-        cls,
-        is_supported_checker: Callable,
-        grad_ckpt_context: Callable,
-        is_param_grad_enabled: Callable,
-    ):
-        """Register custom support for `AutoQuantize` score estimation.
+    def _apply_quant_group_rule(self, name: str, rule) -> str | None:
+        """Apply a single quant_group_rule to a module name.
 
-        If the `is_supported_checker(model)` returns True, the `grad_ckpt_context(model)` will be
-        used to enable gradient checkpointing and `is_param_grad_enabled(pname, model)`
-        will be used to enable gradient for the parameter.
-        """
-        cls.custom_support.append((is_supported_checker, grad_ckpt_context, is_param_grad_enabled))
+        Args:
+            name: Module name
+            rule: Either a regex pattern string or a callable that returns a unique key;
+                If callable, it should take the model and the name as input and return the unique key
 
-    def _get_default_forward_backward_step(self):
-        def forward_backward_step(model, data):
-            output = self.config["forward_step"](model, data)
-            loss = self.config["loss_func"](output, data)
-            try:
-                loss.backward()
-            except RuntimeError as e:
-                raise RuntimeError(
-                    "AutoQuantize: Error while calling `backward()` on the loss returned by `loss_func`. "
-                    "Please fix this!"
-                ) from e
-
-        return forward_backward_step
-
-    @torch.enable_grad()
-    def _estimate_auto_quantize_scores(self, is_param_grad_enabled):
-        # TODO: remove the no-quant recipe
-        def auto_quantize_score_estimate_forward(module, input, *args, **kwargs):
-            module.quant_recipe = QuantRecipe(quant_cfg=None)
-            output = module._forward_original(input, *args, **kwargs)
-
-            # If gradient checkpointing is enabled, gradient will not be enabled in the global forward pass.
-            # With gradient checkpointing, gradients are computed in the local forward pass during backward pass
-
-            # Lets compute the output_diff and save it in memory only if gradient is enabled to be memory efficient
-            if not torch.is_grad_enabled():
-                return output
-
-            module.output_diff_dict = {}
-            with torch.no_grad():
-                for recipe in module.get_hparam("quant_recipe").choices:
-                    if recipe == QuantRecipe(quant_cfg=None):
-                        continue
-                    module.quant_recipe = recipe
-                    output_diff = module._forward_original(input, *args, **kwargs)
-
-                    if isinstance(output_diff, tuple):
-                        output_diff = output_diff[0] - output[0]
-                    else:
-                        output_diff -= output
-                    module.output_diff_dict[recipe] = output_diff
-
-            return output
-
-        def backward_hook(module, grad_input, grad_output):
-            for recipe, output_diff in module.output_diff_dict.items():
-                score_tensor = module.get_hparam("quant_recipe")._importance_dict[recipe][module]
-                _add_auto_quantize_score(grad_output[0], output_diff, score_tensor)
-
-            del module.output_diff_dict
+        Returns:
+            The group key if the rule matches, None otherwise
+        """
+        if callable(rule):
+            return rule(self.model, name)
+        else:
+            # Regex pattern
+            pattern = re.compile(rule)
+            match = pattern.match(name)
+            if match:
+                return match.group(1)
+        return None
 
-        def setup_params_for_score_estimation(name, param, params_metadata, enable_grad=True):
-            # Let us delete the gradient as soon as they are computed to save memory
-            # In addition, this method enables gradient for all parameters
-            # This is needed to make sure the re-entrant activation checkpointing works
-            params_metadata[name] = {"requires_grad": param.requires_grad}
-            param.requires_grad = enable_grad
-            if not enable_grad:
-                return
-            if self.config.get("verbose", False):
-                print_rank_0(f"AutoQuantize: Enabling gradient for param {name}.")
-            accum_grad, handle = create_param_grad_clear_hook(param)
-            params_metadata[name]["accum_grad"] = accum_grad  # We need to keep the accum_grad alive
-            params_metadata[name]["handle"] = handle
+    def _apply_score_group_rule(self, name: str, rule) -> str | None:
+        """Apply a single score_group_rule to a module name.
 
-        def setup_module_for_score_estimation(module):
-            module._forward_original = module.forward
-            module.forward = types.MethodType(auto_quantize_score_estimate_forward, module)
-            module._backward_hook_handle = module.register_full_backward_hook(backward_hook)
+        Args:
+            name: Module name
+            rule: Either a regex pattern string or a callable that returns the score module name.
+                If callable, it should take the model and the name as input and return the score module name
 
-        def cleanup_module_after_score_estimation(module):
-            module.forward = module._forward_original
-            del module._forward_original
+        Returns:
+            The score module name if the rule matches, None otherwise
+        """
+        if callable(rule):
+            return rule(self.model, name)
+        else:
+            # Regex pattern - return the matched name or full match
+            pattern = re.compile(rule)
+            match = pattern.match(name)
+            if match:
+                # For score rules, return the full match or first group
+                return match.group(0) if match.lastindex is None else match.group(1)
+        return None
 
-            module._backward_hook_handle.remove()
+    def _get_score_module_from_name(
+        self, model: nn.Module, score_module_name: str, quant_module: nn.Module
+    ) -> nn.Module:
+        """Get the actual score module object from its name.
 
-        def cleanup_params_after_score_estimation(name, param, params_metadata):
-            param.requires_grad = params_metadata[name]["requires_grad"]
-            handle = params_metadata[name].get("handle", None)
-            if handle is not None:
-                handle.remove()
+        Args:
+            model: The model containing all modules
+            score_module_name: The name of the score module to retrieve
+            quant_module: The quantized module for which the score is estimated
 
-        for name, module in self.model.named_modules():
-            if (
-                self._is_auto_quantize_module(module)
-                and module.get_hparam("quant_recipe").is_configurable
-            ):
-                # Monkey patch the forward methods to cache Y(Q(W), Q(X)) - Y(W,X)
-                setup_module_for_score_estimation(module)
-
-        params_metadata = {}
-
-        for name, param in self.model.named_parameters():
-            setup_params_for_score_estimation(
-                name, param, params_metadata, is_param_grad_enabled(name, self.model)
+        Returns:
+            The score module object, or the quantized module itself if the score module is not found
+        """
+        try:
+            score_module = model.get_submodule(score_module_name)
+            return score_module
+        except AttributeError:
+            warnings.warn(
+                f"Score module '{score_module_name}' not found. Score will estimated from the quantized module itself."
             )
+            return quant_module
 
-        gc.collect()
-        if torch.cuda.is_available():
-            torch.cuda.reset_peak_memory_stats()
-            report_memory("AutoQuantize: starting score estimation, ")
-
-        self._run_func(
-            self.config["forward_backward_step"],
-            num_iters=self.config["num_score_steps"],
-            desc="Estimating auto_quantize scores",
-        )
-
-        if torch.cuda.is_available():
-            report_memory("AutoQuantize: After score estimation")
-
-        for name, module in self.model.named_modules():
-            if (
-                self._is_auto_quantize_module(module)
-                and module.get_hparam("quant_recipe").is_configurable
-            ):
-                cleanup_module_after_score_estimation(module)
-
-        for name, param in self.model.named_parameters():
-            cleanup_params_after_score_estimation(name, param, params_metadata)
-
-        # Delete the params_metadata
-        del params_metadata
-        gc.collect()
-
-    @classmethod
-    def insert_hparams_after_merge_rules(cls, model, quant_recipes, disabled_layers=None):
+    def insert_hparams_after_merge_rules(self, model, quant_recipes, disabled_layers=None):
         """Restrict the search space using the merge rules and insert the hparams for the model."""
         # TRTLLM fuses linear layers such as q_proj, k_proj, v_proj into same layer
         # Hence we need to restrict the search space so that all these layers share the same recipe
@@ -495,9 +471,11 @@ def insert_hparams_after_merge_rules(cls, model, quant_recipes, disabled_layers=
         elif isinstance(disabled_layers, str):
             disabled_layers = [disabled_layers]
 
-        search_map: dict[str, list[tuple[nn.Module, bool]]] = {}
+        # Map from group key to list of (quant_module, name, disabled, score_module)
+        search_map: dict[str, list[tuple[nn.Module, str, bool, nn.Module]]] = {}
+
         for name, module in model.named_modules():
-            if not cls._is_auto_quantize_module(module):
+            if not self._is_auto_quantize_module(module):
                 continue
 
             # Skip layers that match disabled_layers patterns
@@ -507,28 +485,46 @@ def insert_hparams_after_merge_rules(cls, model, quant_recipes, disabled_layers=
                     disabled = True
                     break
 
-            prefix = name
-            for rule in cls.rules:
-                pattern = re.compile(rule)
-                match = pattern.match(name)
-                if match:
-                    prefix = match.group(1)
+            # Apply quant_grouping_rules to determine the group key
+            group_key = name  # Default: each module in its own group
+            for rule in self.quant_grouping_rules:
+                result = self._apply_quant_group_rule(name, rule)
+                if result is not None:
+                    group_key = result
                     # We support only one rule for matching per module
                     break
-            if prefix not in search_map:
-                search_map[prefix] = [(module, disabled)]
+
+            # Apply score_module_rules to determine the score module name, then get the actual module
+            score_module_name = name  # Default: score from same module
+            for rule in self.score_module_rules:
+                result = self._apply_score_group_rule(name, rule)
+                if result is not None:
+                    score_module_name = result
+                    # We support only one rule for matching per module
+                    break
+
+            # Get the actual score module object immediately
+            score_module = self._get_score_module_from_name(model, score_module_name, module)
+
+            if group_key not in search_map:
+                search_map[group_key] = [(module, name, disabled, score_module)]
             else:
-                search_map[prefix].append((module, disabled))
-
-        for prefix, module_info_list in search_map.items():
-            modules = [module for module, _ in module_info_list]
-            disabled = any(disabled for _, disabled in module_info_list)
-            hparam = (
-                QuantRecipeHparam(None, nn_modules=modules)
-                if disabled
-                else QuantRecipeHparam(quant_recipes, nn_modules=modules)
+                search_map[group_key].append((module, name, disabled, score_module))
+
+        for group_key, module_info_list in search_map.items():
+            quant_modules = [module for module, _, _, _ in module_info_list]
+            disabled = any(disabled for _, _, disabled, _ in module_info_list)
+            score_modules = [score_module for _, _, _, score_module in module_info_list]
+
+            quant_recipes = None if disabled else quant_recipes
+            hparam = QuantRecipeHparam(
+                quant_recipes,
+                quant_modules=quant_modules,
+                score_modules=score_modules,
+                name=str(group_key),
             )
-            for module in modules:
+
+            for module in quant_modules:
                 module._register_hparam("quant_recipe", hparam)
 
     def _get_formatted_weight_compression_constraint(self):
@@ -547,6 +543,33 @@ def _verify_constraint(self, search_recipes):
             f"{search_recipes[0]} whose num_bits = {search_recipes[0].num_bits}."
         )
 
+    @abstractmethod
+    def estimate_sensitivity_scores(self) -> None:
+        """Estimate sensitivity scores and track them with Hparam."""
+
+    def initialize_candidate_stats(self):
+        """Initialize the candidate stats for the model."""
+        for name, hparam in named_hparams(self.model, unique=True):
+            if not isinstance(hparam, QuantRecipeHparam):
+                continue
+
+            formats, scores, costs = [], [], []
+            prev_score = float("inf")
+            for recipe in hparam.choices:
+                formats.append(recipe)
+
+                score = hparam.get_score(recipe)  # type: ignore [arg-type]
+                cost = hparam.get_cost(recipe)  # type: ignore [arg-type]
+
+                score = min(score, prev_score)  # TODO: Should we get rid of this?
+                scores.append(score)
+                costs.append(cost)
+                prev_score = score
+
+            self.candidate_stats[name]["formats"] = formats
+            self.candidate_stats[name]["scores"] = scores
+            self.candidate_stats[name]["costs"] = costs
+
     def _run_func(self, func, num_iters=1, desc=""):
         for i, data in tqdm(
             zip(range(num_iters), self.config["data_loader"]),
@@ -572,7 +595,7 @@ def before_search(self):
 
         # Iterate over the search recipes and calibrate the quantizers for each recipe
         for recipe in search_recipes:
-            if recipe.compression >= 1.0:
+            if recipe == QuantRecipe(quant_cfg=None):  # No-quant format
                 continue
 
             # Lets reduce the number of calibration steps for AWQ since it takes longer
@@ -605,6 +628,330 @@ def forward_loop(model):
             # TODO: This is a hack. We need to create a mode for auto_quantize to handle this in a clean way.
             ModeloptStateManager(self.model).state_dict().pop()
 
+        if self.candidate_stats:
+            if self.config["verbose"]:
+                print_rank_0("AutoQuantize: Restored from checkpoint, skipping scoring")
+            return
+
+        self.estimate_sensitivity_scores()
+        self.initialize_candidate_stats()
+        # Save checkpoint after successful score estimation
+        self.save_search_checkpoint(verbose=self.config["verbose"])
+
+    @staticmethod
+    def _get_total_weight_size(modules):
+        return sum(
+            (
+                module.weight.numel()
+                if _AutoQuantizeBaseSearcher._is_auto_quantize_module(module)
+                else 0
+            )
+            for module in modules
+        )
+
+    def _get_constraints_for_search(self, max_weight_size, lower_bound=None):
+        constraints = {
+            "weight_size_after_compression": (
+                lower_bound * max_weight_size if lower_bound else lower_bound,
+                max_weight_size,
+            )
+        }
+        return constraints, "weight_size_after_compression"
+
+    @abstractmethod
+    def run_search_with_stats(self, max_weight_size, verbose=False):
+        """Run the search with stats to get the best recipe and whether the constraints are satisfied."""
+
+    def run_search(self):
+        """Search for the best per-layer quantization configuration and return the best model and configuration."""
+        verbose = self.config["verbose"]
+        assert len(self.constraints) == 1 and "effective_bits" in self.constraints, (
+            f"`constraints` must contain only 'effective_bits' constraint. "
+            f"Got {self.constraints.keys()}"
+        )
+
+        compression = self._get_formatted_weight_compression_constraint()
+        total_weight_size = self._get_total_weight_size(self.model.modules())
+        max_weight_size = total_weight_size * compression
+
+        # Run the search with stats to get the best recipe and whether the constraints are satisfied
+        best_recipe_info, is_satisfied = self.run_search_with_stats(max_weight_size, verbose)
+        self.best["is_satisfied"] = is_satisfied
+
+        best_recipe = {}
+        best_constraints, best_scores = 0, 0
+        for name, best_hparam_recipe_info in best_recipe_info.items():
+            # Solvers could give different solutions for the same layer across DP/TP groups even though
+            # the scores and costs are the same. Lets make sure the same recipe is selected across DP/TP
+            _ps = self.model.get_submodule(name.split(".quant_recipe")[0]).parallel_state
+            best_format = DistributedProcessGroup.get_dist_syncd_obj(
+                best_hparam_recipe_info["format"],
+                [_ps.data_parallel_group, _ps.tensor_parallel_group],
+                lambda a: a[0],
+            )
+
+            best_recipe[name] = best_format
+            get_hparam(self.model, name).active = best_format
+            best_constraints += best_hparam_recipe_info["costs"]
+            best_scores += best_hparam_recipe_info["scores"]
+            if verbose:
+                print_rank_0(
+                    f"AutoQuantize best recipe for {name.replace('.quant_recipe', '')}: {best_recipe[name]}"
+                )
+
+        effective_bits_from_search = (best_constraints / total_weight_size) * 16
+        if verbose:
+            print_rank_0(
+                f"AutoQuantize effective bits from search: {effective_bits_from_search: .2f}"
+            )
+
+        self.best["recipe"] = best_recipe
+        self.best["constraints"] = {"effective_bits": effective_bits_from_search}
+        self.best["score"] = best_scores
+
+        QuantRecipe.fold_pqs_to_weights(self.model)
+
+
+def _get_auto_quantize_score(grad_output, output_diff):
+    return ((grad_output.float() ** 2) * (output_diff.float() ** 2)).sum()
+
+
+def _add_auto_quantize_score(grad_output, output_diff, score_tensor):
+    score_tensor += _get_auto_quantize_score(grad_output, output_diff)
+
+
+class AutoQuantizeGradientSearcher(_AutoQuantizeBaseSearcher):
+    """A searcher for AutoQuantize algorithm that uses gradient based score estimation.
+
+    In AutoQuantize, we search for the best per-layer quantization configuration that minimizes the sum of per-layer
+    scores while meeting the specified constraint. AutoQuantize uses Linear Programming Solver to find the
+    optimal quantization configuration.
+
+    The auto_quantize score for a layer quantization configuration is an approximation of model loss change due
+    to quantizing the particular layer with the particular configuration.
+    The approximation is based on taylor expansion of the loss function wrt to the quantized output of the layer and
+    substitution of Fisher information for Hessian.
+    This approximation is mathematically correct for models where the loss
+    is a log likelihood loss such as BERT, GPT, etc. However, the auto_quantize score can still be used as a proxy
+    for other models such as ResNet.
+
+    **Quant Modules:**
+
+    This searcher operates on quantizable modules (quant modules), which are typically Linear or Conv layers
+    that support quantization. Optionally, grouping rules can be applied to ensure certain layers share the same
+    quantization format (e.g., Q, K, V projections in the same attention layer). For details on quant_grouping_rules
+    and customization, see the :meth:`auto_quantize <modelopt.torch.quantization.model_quant.auto_quantize>`
+    API documentation.
+
+    **Score Modules:**
+
+    By default, for each quant module, its sensitivity score is estimated using that module's output perturbation.
+    However, the sensitivity can also be estimated by looking at perturbation at a separate point in the neural
+    network (score module). This is helpful in some cases such as MoEs for speed and lower memory consumption.
+    Since all experts are already restricted to the same quant format by quant grouping rules, their sensitivity
+    can be estimated together at a single point (e.g., the MLP output level).
+    """
+
+    score_module_rules = [
+        # Use MLP layer output for gate_proj, up_proj, down_proj for Qwen3 like MoE models (local and shared experts)
+        r"^(.*?\.mlp\.experts)\.\d+\.(gate_proj|up_proj|down_proj)$",
+        r"^(.*?)\.(\d+\.(w1|w2|w3))$",  # mixtral experts
+        r"^(.*?)\.((w1_linear|w2_linear|w3_linear)\.\d+)$",  # dbrx experts
+    ]
+
+    # See `register_custom_support` for details
+    _custom_support: list[tuple[Callable, Callable, Callable]] = []
+
+    @property
+    def default_search_config(self):
+        """Get the default config for the searcher."""
+        config = super().default_search_config
+        config.update(
+            {
+                "forward_step": None,
+                "loss_func": None,
+                "forward_backward_step": None,
+            }
+        )
+        return config
+
+    def sanitize_search_config(self, config: SearchConfig | None) -> SearchConfig:
+        """Sanitize the search config dict."""
+        config = config or {}
+        if "score_func" in config:
+            warnings.warn("`score_func` is ignored for gradient based `auto_quantize`.")
+            config.pop("score_func")
+        config = super().sanitize_search_config(config)
+        if config["forward_backward_step"] is None:
+            assert config["loss_func"] is not None, (
+                "`loss_func` or `forward_backward_step` must be provided for `auto_quantize`."
+            )
+            config["forward_backward_step"] = self._get_default_forward_backward_step()
+
+        return config
+
+    @classmethod
+    def register_custom_support(
+        cls,
+        is_supported_checker: Callable,
+        grad_ckpt_context: Callable,
+        is_param_grad_enabled: Callable,
+    ) -> None:
+        """(Optional) Register custom support for `AutoQuantize` score estimation.
+
+        This custom support is used to enable memory/compute efficient backward gradient propagation. This involves:
+
+        - `grad_ckpt_context`: backward pass with gradient checkpointing enabled
+        - `is_param_grad_enabled`: AutoQuantize only needs activation gradients to be computed (not weight
+          gradients). `is_param_grad_enabled` is used to select which parameters should have gradients enabled,
+          limiting gradient computation to only what's needed for activation gradients. For LLMs, to trigger all
+          activation gradient computation, just enabling the embedding layer weight gradient is sufficient. This will
+          enable gradient computation for all the activation gradients downstream.
+
+        If the `is_supported_checker(model)` returns True, the `grad_ckpt_context(model)` will be
+        used to enable gradient checkpointing and `is_param_grad_enabled(pname, model)`
+        will be used to select which parameters have gradients enabled to minimize gradient computation.
+        """
+        cls._custom_support.append((is_supported_checker, grad_ckpt_context, is_param_grad_enabled))
+
+    def _get_default_forward_backward_step(self):
+        def forward_backward_step(model, data):
+            output = self.config["forward_step"](model, data)
+            loss = self.config["loss_func"](output, data)
+            try:
+                loss.backward()
+            except RuntimeError as e:
+                raise RuntimeError(
+                    "AutoQuantize: Error while calling `backward()` on the loss returned by `loss_func`. "
+                    "Please fix this!"
+                    f"error: {e}"
+                ) from e
+
+        return forward_backward_step
+
+    @torch.enable_grad()
+    def _estimate_auto_quantize_scores(self, is_param_grad_enabled):
+        # TODO: remove the no-quant recipe
+        def auto_quantize_score_estimate_forward(module, input, *args, **kwargs):
+            for hparam in module._hparams_for_scoring:
+                if hparam.is_configurable:
+                    hparam.active = QuantRecipe(quant_cfg=None)
+
+            output = module._forward_original(input, *args, **kwargs)
+
+            # If gradient checkpointing is enabled, gradient will not be enabled in the global forward pass.
+            # With gradient checkpointing, gradients are computed in the local forward pass during backward pass
+
+            # Lets compute the output_diff and save it in memory only if gradient is enabled to be memory efficient
+            if not torch.is_grad_enabled():
+                return output
+
+            module.output_diff_dict = {hparam: {} for hparam in module._hparams_for_scoring}
+            with torch.no_grad():
+                for hparam in module._hparams_for_scoring:
+                    if not hparam.is_configurable:
+                        continue
+                    for recipe in hparam.choices:
+                        if recipe == QuantRecipe(quant_cfg=None):
+                            continue
+                        hparam.active = recipe
+                        output_diff = module._forward_original(input, *args, **kwargs)
+
+                        if isinstance(output_diff, tuple):
+                            output_diff = output_diff[0] - output[0]
+                        else:
+                            output_diff -= output
+                        module.output_diff_dict[hparam][recipe] = output_diff.detach()
+
+                    # Disable the configurable hparam now that we have computed the diff
+                    hparam.active = QuantRecipe(quant_cfg=None)
+
+            return output
+
+        def backward_hook(module, grad_input, grad_output):
+            for hparam, output_diff_dict in module.output_diff_dict.items():
+                for recipe, output_diff in output_diff_dict.items():
+                    if hparam._importance_dict[recipe][module] is None:
+                        hparam._importance_dict[recipe][module] = _get_auto_quantize_score(
+                            grad_output[0], output_diff
+                        )
+                    else:
+                        _add_auto_quantize_score(
+                            grad_output[0], output_diff, hparam._importance_dict[recipe][module]
+                        )
+
+        def setup_params_for_score_estimation(name, param, params_metadata, enable_grad=True):
+            # Let us delete the gradient as soon as they are computed to save memory
+            params_metadata[name] = {"requires_grad": param.requires_grad}
+            param.requires_grad = enable_grad
+            if not enable_grad:
+                return
+            if self.config.get("verbose", False):
+                print_rank_0(f"AutoQuantize: Enabling gradient for param {name}.")
+            accum_grad, handle = create_param_grad_clear_hook(param)
+            params_metadata[name]["accum_grad"] = accum_grad  # We need to keep the accum_grad alive
+            params_metadata[name]["handle"] = handle
+
+        def setup_module_for_score_estimation(module):
+            module._forward_original = module.forward
+            module.forward = types.MethodType(auto_quantize_score_estimate_forward, module)
+            module._backward_hook_handle = module.register_full_backward_hook(backward_hook)
+
+        def cleanup_module_after_score_estimation(module):
+            module.forward = module._forward_original
+            del module._forward_original
+
+            module._backward_hook_handle.remove()
+
+        def cleanup_params_after_score_estimation(name, param, params_metadata):
+            param.requires_grad = params_metadata[name]["requires_grad"]
+            handle = params_metadata[name].get("handle", None)
+            if handle is not None:
+                handle.remove()
+
+        score_modules = set()
+        for name, module in self.model.named_modules():
+            if (
+                hasattr(module, "_hparams_for_scoring")
+                and any(hparam.is_configurable for hparam in module._hparams_for_scoring)
+                and module not in score_modules
+            ):
+                # Monkey patch the forward methods to cache (Q(Y) - Y)
+                setup_module_for_score_estimation(module)
+                score_modules.add(module)
+
+        params_metadata = {}
+        for name, param in self.model.named_parameters():
+            setup_params_for_score_estimation(
+                name, param, params_metadata, is_param_grad_enabled(name, self.model)
+            )
+
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.reset_peak_memory_stats()
+            report_memory("AutoQuantize: starting score estimation, ")
+
+        self._run_func(
+            self.config["forward_backward_step"],
+            num_iters=self.config["num_score_steps"],
+            desc="Estimating auto_quantize scores",
+        )
+
+        if torch.cuda.is_available():
+            report_memory("AutoQuantize: After score estimation")
+
+        for module in score_modules:
+            cleanup_module_after_score_estimation(module)
+
+        for name, param in self.model.named_parameters():
+            cleanup_params_after_score_estimation(name, param, params_metadata)
+
+        # Delete the params_metadata
+        del params_metadata
+        gc.collect()
+
+    def estimate_sensitivity_scores(self) -> None:
+        """Estimate sensitivity scores using hessian approximation."""
         self.model.eval()
 
         def _default_is_param_grad_enabled(pname, model):
@@ -612,7 +959,7 @@ def _default_is_param_grad_enabled(pname, model):
 
         grad_checkpointing_ctxt = None
         is_param_grad_enabled = _default_is_param_grad_enabled
-        for is_supported_checker, ctxt_candidate, grad_enabled_candidate in self.custom_support:
+        for is_supported_checker, ctxt_candidate, grad_enabled_candidate in self._custom_support:
             if is_supported_checker(self.model):
                 grad_checkpointing_ctxt = ctxt_candidate
                 is_param_grad_enabled = grad_enabled_candidate
@@ -621,73 +968,21 @@ def _default_is_param_grad_enabled(pname, model):
         with grad_checkpointing_ctxt(self.model) if grad_checkpointing_ctxt else nullcontext():
             self._estimate_auto_quantize_scores(is_param_grad_enabled)
 
-    def run_search(self):
-        """Search for the best per-layer quantization configuration and return the best model and configuration.
+    def run_search_with_stats(self, max_weight_size, verbose=False):
+        """Linear Programming Solve for gradient based auto_quantize.
 
         AutoQuantize uses Linear Programming Solver to find the optimal quantization configuration which
         minimizes the sum of per-layer auto_quantize scores while meeting the specified constraint.
         """
-
-        def get_total_weight_size(modules):
-            return sum(
-                (module.weight.numel() if self._is_auto_quantize_module(module) else 0)
-                for module in modules
-            )
-
-        def _get_constraints_for_search(max_weight_size, lower_bound=None):
-            constraints = {
-                "weight_size_after_compression": (
-                    lower_bound * max_weight_size if lower_bound else lower_bound,
-                    max_weight_size,
-                )
-            }
-            return constraints, "weight_size_after_compression"
-
-        verbose = self.config["verbose"]
-        assert len(self.constraints) == 1 and "effective_bits" in self.constraints, (
-            f"`constraints` must contain only 'effective_bits' constraint. "
-            f"Got {self.constraints.keys()}"
-        )
-
-        compression = self._get_formatted_weight_compression_constraint()
-        total_weight_size = get_total_weight_size(self.model.modules())
-        weight_size_after_compression = total_weight_size * compression
-
-        for name, hparam in named_hparams(self.model, unique=True):
-            if not isinstance(hparam, QuantRecipeHparam):
-                continue
-
-            formats, scores, costs = [], [], []
-            prev_score = float("inf")
-            for recipe in hparam.choices:
-                formats.append(recipe)
-                score = hparam.importance[recipe]
-                cost = get_total_weight_size(hparam.nn_modules) * recipe.compression  # type: ignore [union-attr]
-
-                # Lets get the score across Data Parallel (DP) and Tensor Parallel (TP) groups
-                # This way we constraint the same quantization format for the same layer across the DP/TP groups
-                # The cost we use here is weight size. They are the same across DP/TP groups.
-                _ps = self.model.get_submodule(name.split(".quant_recipe")[0]).parallel_state
-                # The score is the sum of the scores across DP and TP groups
-                score = DistributedProcessGroup.get_dist_syncd_obj(
-                    score, [_ps.data_parallel_group, _ps.tensor_parallel_group], sum
-                )
-
-                scores.append(min(score, prev_score))
-                costs.append(cost)
-                prev_score = score
-
-            self.candidate_stats[name]["formats"] = formats
-            self.candidate_stats[name]["scores"] = scores
-            self.candidate_stats[name]["costs"] = costs
+        # TODO: Do this only for rank 0 in the respective pipeline group
 
         for lower_bound in [None, 0.99, 0.90]:
             # The LP solver for auto_quantize sometimes fails to find a solution if a lower bound is not
             # specified. I dont know why this happens.
             # As a workaround, lets specify a lower bound for the weight compression if previous
             # search without lower bound fails.
-            constraints, constraint_name = _get_constraints_for_search(
-                weight_size_after_compression, lower_bound
+            constraints, constraint_name = self._get_constraints_for_search(
+                max_weight_size, lower_bound
             )
 
             lps = LPS(
@@ -708,47 +1003,251 @@ def _get_constraints_for_search(max_weight_size, lower_bound=None):
             if self.status == "Optimal":
                 break
 
-        self.best = {}
-
         if self.status != "Optimal":
             warnings.warn(
                 "AutoQuantize FAILED to find a solution! The searched model might not meet all constraints. "
             )
-            self.best["is_satisfied"] = False
+            is_satisfied = False
         else:
-            self.best["is_satisfied"] = True
+            is_satisfied = True
 
-        best_recipe = {}
-        best_constraints, best_scores = 0, 0
+        best_recipes = {}
         for name, selected_idx in zip(self.candidate_stats.keys(), selections):
-            best_recipe_for_name = self.candidate_stats[name]["formats"][selected_idx]
+            best_recipes[name] = {
+                "format": self.candidate_stats[name]["formats"][selected_idx],
+                "costs": self.candidate_stats[name]["costs"][selected_idx],
+                "scores": self.candidate_stats[name]["scores"][selected_idx],
+            }
 
-            # LP solver could give different solutions for the same layer across DP/TP groups even though
-            # the scores and costs are the same. Lets make sure the same quantization format is selected across DP/TP
-            _ps = self.model.get_submodule(name.split(".quant_recipe")[0]).parallel_state
-            best_recipe_for_name = DistributedProcessGroup.get_dist_syncd_obj(
-                best_recipe_for_name,
-                [_ps.data_parallel_group, _ps.tensor_parallel_group],
-                lambda a: a[0],
+        return best_recipes, is_satisfied
+
+
+# TODO: Enable torch compile for this function
+# Currently modelopt.onnx is breaking this
+def _get_softmax_dist(
+    logits: torch.Tensor, tp_group, return_log_prob: bool = False
+) -> torch.Tensor:
+    # TODO: test this
+    dtype = logits.dtype
+    max_logits = torch.amax(logits, dim=-1, keepdim=True)
+    torch.distributed.all_reduce(max_logits, op=torch.distributed.ReduceOp.MAX, group=tp_group)
+    logits = (logits - max_logits).float()
+    sum_exp_logits = torch.exp(torch.logsumexp(logits, dim=-1, keepdim=True))
+    torch.distributed.all_reduce(sum_exp_logits, op=torch.distributed.ReduceOp.SUM, group=tp_group)
+    logits = logits - torch.log(sum_exp_logits)
+    if return_log_prob:
+        return logits.to(dtype)
+    else:
+        return torch.exp(logits).to(dtype)
+
+
+def _get_softmax(logits: torch.Tensor, return_log_prob: bool = False) -> torch.Tensor:
+    # TODO: do we need to do log_softmax in float32?
+    # log_softmax is supposed to be numerically stable implementation
+    log_prob = torch.log_softmax(logits.float(), dim=-1)
+    if return_log_prob:
+        return log_prob
+    else:
+        return torch.exp(log_prob)
+
+
+def _get_p_log_q(p: torch.Tensor, log_q: torch.Tensor) -> torch.Tensor:
+    return torch.sum(p * log_q).float()
+
+
+def _get_prob_from_logits(
+    logits: torch.Tensor, return_log_prob: bool = False, lm_head: nn.Module = None
+) -> torch.Tensor:
+    parallel_state: ParallelState | None = (
+        getattr(lm_head, "parallel_state", None) if lm_head is not None else None
+    )
+    if parallel_state is not None and parallel_state.tensor_parallel_group.is_initialized():
+        return _get_softmax_dist(
+            logits, parallel_state.tensor_parallel_group.group, return_log_prob
+        )
+    return _get_softmax(logits, return_log_prob)
+
+
+def _get_kl_div_loss(
+    prob_unquant: torch.Tensor, logits_quant: torch.Tensor, lm_head: nn.Module = None
+) -> torch.Tensor:
+    log_prob_quant = _get_prob_from_logits(logits_quant, return_log_prob=True, lm_head=lm_head)
+    # We dont need to calculate the full kl div loss here, just get - p*log_q
+    return -_get_p_log_q(prob_unquant, log_prob_quant)
+
+
+def _get_lm_head(model: nn.Module) -> nn.Module:
+    # HF models do allgather of logits to at lm_head
+    # Hence lm_head outputs are not TP sharded - so we dont need to return the lm_head for TP KLDiv
+    # Loss
+    for name, module in model.named_modules():
+        if name.endswith("output_layer"):  # Megatron models
+            return module
+    return None
+
+
+class AutoQuantizeKLDivSearcher(_AutoQuantizeBaseSearcher):
+    """A searcher for AutoQuantize algorithm that uses KL-Divergence loss based score estimation."""
+
+    @property
+    def default_search_config(self):
+        """Get the default config for the searcher."""
+        config = super().default_search_config
+        config.update(
+            {
+                "forward_step": None,
+            }
+        )
+        return config
+
+    def sanitize_search_config(self, config: SearchConfig | None) -> SearchConfig:
+        """Sanitize the search config dict."""
+        config = config or {}
+        for ignored_key in ["score_func", "loss_func", "forward_backward_step"]:
+            if ignored_key in config:
+                if config[ignored_key] is not None:
+                    warnings.warn(
+                        f"`{ignored_key}` is ignored for KL-Divergence loss based `auto_quantize`."
+                    )
+                config.pop(ignored_key)
+        config = super().sanitize_search_config(config)
+        assert config["forward_step"] is not None, (
+            "`forward_step` must be provided for KL-Divergence loss based `auto_quantize`. "
+            "`forward_step(model, data)` should return model logits."
+        )
+        return config
+
+    @torch.inference_mode()
+    def estimate_sensitivity_scores(self):
+        """Estimate the sensitivity scores for the model.
+
+        Higher score means more sensitive to quantization.
+        """
+
+        def set_to_unquantized():
+            for name, hparam in named_hparams(self.model, unique=True):
+                if not isinstance(hparam, QuantRecipeHparam):
+                    continue
+                if hparam.is_configurable:
+                    hparam.active = QuantRecipe(quant_cfg=None)
+
+        self.model.eval()
+        num_iters = self.config["num_score_steps"]
+        for _, data in tqdm(
+            zip(range(num_iters), self.config["data_loader"]),
+            desc="Estimating KLDivergence loss",
+            total=num_iters,
+        ):
+            set_to_unquantized()
+            logits_unquant = self.config["forward_step"](self.model, data)
+            prob_unquant = _get_prob_from_logits(
+                logits_unquant,
+                return_log_prob=False,
+                lm_head=_get_lm_head(self.model),
             )
 
-            best_recipe[name] = best_recipe_for_name
-            get_hparam(self.model, name).active = best_recipe_for_name
-            best_constraints += self.candidate_stats[name]["costs"][selected_idx]
-            best_scores += self.candidate_stats[name]["scores"][selected_idx]
+            for name, hparam in tqdm(
+                list(named_hparams(self.model, configurable=True)), desc="Evaluating hparams"
+            ):
+                if not isinstance(hparam, QuantRecipeHparam):
+                    continue
+                for recipe in hparam.choices:
+                    if recipe == QuantRecipe(quant_cfg=None):
+                        continue
+                    hparam.active = recipe
+                    logits_quant = self.config["forward_step"](self.model, data)
+                    score = _get_kl_div_loss(prob_unquant, logits_quant, _get_lm_head(self.model))
+                    if hparam._importance_dict[recipe][hparam.score_modules[0]] is None:
+                        hparam._importance_dict[recipe][hparam.score_modules[0]] = score
+                    else:
+                        hparam._importance_dict[recipe][hparam.score_modules[0]] += score
+                hparam.active = QuantRecipe(quant_cfg=None)
+
+    def run_search_with_stats(self, max_weight_size, verbose=False):
+        """Run threshold-based binary search for KLDivergence loss based auto_quantize.
+
+        We use binary search to minimize the max(per-layer score) while meeting the constraint.
+        """
+        # Collect all sensitivity scores to determine initial threshold bounds
+        all_scores = [
+            score for name in self.candidate_stats for score in self.candidate_stats[name]["scores"]
+        ]
+
+        if not all_scores:
+            warnings.warn("No scores available for threshold-based search!")
+            is_satisfied = False
+            return {}, is_satisfied
+
+        # Initialize binary search bounds
+        min_score = min(all_scores)
+        max_score = max(all_scores)
+        threshold = (min_score + max_score) / 2.0
+        lower_bound = min_score
+        upper_bound = max_score
+
+        # Run for fixed number of iterations
+        max_iterations = 100
+
+        if verbose:
+            print_rank_0("AutoQuantize: Starting threshold-based binary search")
+            print_rank_0(f"  Score range: [{min_score:.6e}, {max_score:.6e}]")
+            print_rank_0(f"  Target weight size: {max_weight_size:.2f}")
+
+        for iteration in range(max_iterations):
+            # Select recipes based on current threshold
+            best_recipes = {}
+            total_weight_size = 0.0
+
+            for name in self.candidate_stats:
+                formats = self.candidate_stats[name]["formats"]
+                scores = self.candidate_stats[name]["scores"]
+                costs = self.candidate_stats[name]["costs"]
+
+                selected_idx = 0
+                for idx in range(len(formats)):
+                    if scores[idx] <= threshold:
+                        selected_idx = idx
+                        break
+
+                best_recipes[name] = {
+                    "format": formats[selected_idx],
+                    "costs": costs[selected_idx],
+                    "scores": scores[selected_idx],
+                }
+                total_weight_size += costs[selected_idx]
+
+            # Check if we meet the constraint
+            meets_constraint = total_weight_size <= max_weight_size
+
             if verbose:
                 print_rank_0(
-                    f"AutoQuantize best recipe for {name.replace('.quant_recipe', '')}: {best_recipe[name]}"
+                    f"  Iteration {iteration + 1}: threshold={threshold:.6e}, "
+                    f"weight_size={total_weight_size:.2f}, "
+                    f"meets_constraint={meets_constraint}"
                 )
 
-        effective_bits_from_search = (best_constraints / total_weight_size) * 16
+            # Update binary search bounds
+            if meets_constraint:
+                upper_bound = threshold  # Threshold was too aggressive, relax it
+            else:
+                lower_bound = threshold  # Threshold was too lax, tighten it
+
+            # Update threshold for next iteration
+            threshold = (lower_bound + upper_bound) / 2.0
+
+        # Final check if constraint is satisfied
+        is_satisfied = total_weight_size <= max_weight_size
+
         if verbose:
             print_rank_0(
-                f"AutoQuantize effective bits from search: {effective_bits_from_search: .2f}"
+                f"AutoQuantize: Search complete. "
+                f"Final weight size: {total_weight_size:.2f} "
+                f"(target: {max_weight_size:.2f}), "
+                f"constraint satisfied: {is_satisfied}"
             )
 
-        self.best["recipe"] = best_recipe
-        self.best["constraints"] = {"effective_bits": effective_bits_from_search}
-        self.best["score"] = best_scores
+        return best_recipes, is_satisfied
 
-        QuantRecipe.fold_pqs_to_weights(self.model)
+
+# Backward compatibility alias (defaults to gradient-based searcher)
+AutoQuantizeSearcher = AutoQuantizeGradientSearcher
diff --git a/modelopt/torch/quantization/model_quant.py b/modelopt/torch/quantization/model_quant.py
index deace8e0c..4a2b74a30 100644
--- a/modelopt/torch/quantization/model_quant.py
+++ b/modelopt/torch/quantization/model_quant.py
@@ -31,7 +31,7 @@
 from modelopt.torch.quantization.config import QuantizeConfig
 from modelopt.torch.quantization.conversion import set_quantizer_by_cfg
 
-from .algorithms import AutoQuantizeSearcher, QuantRecipe
+from .algorithms import AutoQuantizeGradientSearcher, AutoQuantizeKLDivSearcher, QuantRecipe
 from .config import QuantizeAlgoCfgType
 from .conversion import set_quantizer_attribute
 from .mode import QuantizeModeRegistry, get_modelike_from_algo_cfg
@@ -231,6 +231,12 @@ def forward_loop(model) -> None:
     return calibrate(model, config["algorithm"], forward_loop=forward_loop)
 
 
+# TODO: create a config interface for auto_quantize and expose setting
+# quant_grouping_rules and score_module_rules as part of the config.
+# This will allow users to customize the grouping and scoring rules for their models.
+# This way wecan limit the granularity of quantization search. For example,
+#  - limit the quantization format search to decoder block level (instead of each linear layer level)
+#  - Same format for all self attention layers of a model etc.
 def auto_quantize(
     model: nn.Module,
     constraints: dict[str, float | str] = {"effective_bits": 4.8},
@@ -246,11 +252,23 @@ def auto_quantize(
     num_calib_steps: int = 512,
     num_score_steps: int = 128,
     verbose: bool = False,
+    method: str = "gradient",
+    checkpoint: str | None = None,
 ):
     r"""Perform optimal per-layer quantization by searching for the best quantization formats per-layer.
 
-    ``auto_quantize`` uses a gradient based sensitivity score to rank the per-layer quantization formats and search
-    for the best quantization formats per-layer.
+    ``auto_quantize`` uses sensitivity scores to rank the per-layer quantization formats and search
+    for the best quantization formats per-layer. The sensitivity score can be computed using gradient-based
+    methods (default) or KL divergence loss, controlled by the ``method`` parameter.
+
+    Internally this API runs two main phases:
+
+    #. Calibrate the quantized model exactly like :func:`quantize` would.
+    #. Estimate per-layer sensitivity scores to decide which format to keep.
+
+    The sensitivity scoring phase typically dominates the runtime of ``auto_quantize``, so decreasing the number of
+    samples used for scoring (see ``num_score_steps``) is the recommended way for improving overall auto_quantize time
+    with minimal accuracy impact.
 
     Args:
         model: A pytorch model with quantizer modules.
@@ -369,10 +387,20 @@ def forward_backward_step(model, batch) -> None:
                 disabled_layers = "*lm_head*"
                 disabled_layers = ["*lm_head*", "*mlp*"]
 
-        num_calib_steps: Number of batches to use for calibrating the quantized model. Suggested value is 512.
+        num_calib_steps: Number of batches to use for calibrating each candidate quantization format. Suggested value
+            is 512.
         num_score_steps: Number of batches to use for estimating ``auto_quantize`` scores. Suggested value is 128.
-            A higher value could increase the time taken for performing ``auto_quantize``.
+            A higher value could increase the time taken for performing ``auto_quantize``; reducing it speeds up the
+            sensitivity score estimation phase and typically affects accuracy less than lowering ``num_calib_steps``.
         verbose: If True, prints the search progress/intermediate results.
+        method: Method to use for estimating sensitivity loss. Higher loss indicates greater sensitivity
+            to quantization. Options are ``"gradient"`` (default; uses gradient-based loss estimation,
+            linear programming search, and requires ``loss_func`` or ``forward_backward_step``) and
+            ``"kl_div"`` (uses KL divergence between unquantized and quantized outputs, relies on
+            threshold-based binary search, and only requires ``forward_step`` returning logits).
+        checkpoint: (Optional) Path to checkpoint file for saving/restoring auto_quantize search state.
+            If the checkpoint file exists, the search state will be restored from it, skipping the
+            expensive score estimation step.
 
     Returns: A tuple (model, state_dict) where ``model`` is the searched and quantized model and
         ``state_dict`` contains the history and detailed stats of the search procedure.
@@ -384,23 +412,32 @@ def forward_backward_step(model, batch) -> None:
         This is to ensure compatibility with TensorRT-LLM which fuses these three linear layers into a single linear
         layer.
 
-        A list of regex pattern rules as defined in :attr:`rules <.algorithms.AutoQuantizeSearcher.rules>`
-        are used to specify the group of layers. The first captured group
-        in the regex pattern (i.e, ``pattern.match(name).group(1)``) is used to group the layers. All the layers
-        that share the same first captured group will have the same quantization format..
+        Grouping rules are defined in :attr:`quant_grouping_rules
+        <.algorithms.AutoQuantizeSearcher.quant_grouping_rules>`.
+        Each rule can be either a regex pattern or a callable function.
+
+        - **Regex patterns**: The first captured group (e.g.,
+          ``pattern.match(name).group(1)``) determines the group key.
+          Layers with the same group key share the same quantization format.
+        - **Functions**: Should take a module name and return a group key
+          (or ``None`` if the rule doesn't apply).
 
-        For example, the rule ``r"^(.*?)\.(q_proj|k_proj|v_proj)$"``
-        groups the `q_proj`, `k_proj`, `v_proj` linear layers belonging to the same transformer layer.
+        Example regex rule: ``r"^(.*?)\.(q_proj|k_proj|v_proj)$"`` groups the
+        `q_proj`, `k_proj`, `v_proj` layers belonging to the same transformer layer.
 
-        You may modify the rules to group the layers as per your requirement.
+        You can customize the rules as needed:
 
         .. code-block:: python
 
             from modelopt.torch.quantization.algorithms import AutoQuantizeSearcher
 
-            # To additionally group the layers belonging to same `mlp` layer,
-            # add the following rule
-            AutoQuantizeSearcher.rules.append(r"^(.*?)\.mlp")
+            # Add a regex rule to group layers in the same `mlp` module
+            AutoQuantizeSearcher.quant_grouping_rules.append(r"^(.*?)\.mlp")
+
+            # Or add a function rule for custom logic
+            AutoQuantizeSearcher.quant_grouping_rules.append(
+                lambda name: name.rsplit(".", 1)[0] if "expert" in name else None
+            )
 
             # Perform `auto_quantize`
             model, state_dict = auto_quantize(model, ...)
@@ -426,12 +463,20 @@ def forward_backward_step(model, batch) -> None:
         processed_quantization_formats.append((quant_cfg, name))
 
     assert len(processed_quantization_formats) > 0, "`quantization_formats` should not be empty"
+
+    # Select the appropriate searcher based on method
+    if method == "gradient":
+        searcher = AutoQuantizeGradientSearcher()
+    elif method == "kl_div":
+        searcher = AutoQuantizeKLDivSearcher()
+    else:
+        raise ValueError(f"Invalid method: {method}. Valid options are 'gradient' or 'kl_div'.")
+
     model = apply_mode(
         model,
         mode="auto_quantize",
         registry=QuantizeModeRegistry,
     )
-    searcher = AutoQuantizeSearcher()
     search_config = {
         "quantization_formats": processed_quantization_formats,
         "data_loader": data_loader,
@@ -442,6 +487,7 @@ def forward_backward_step(model, batch) -> None:
         "num_score_steps": num_score_steps,
         "disabled_layers": disabled_layers,
         "verbose": verbose,
+        "checkpoint": checkpoint,
     }
     # Disable all quantizers; AutoQuantize will enable the needed ones
     set_quantizer_by_cfg(model, {"*": {"enable": False}})
diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py
index af46b6d26..31ac2bbbd 100644
--- a/modelopt/torch/quantization/plugins/huggingface.py
+++ b/modelopt/torch/quantization/plugins/huggingface.py
@@ -35,7 +35,7 @@
 from modelopt.torch.opt.dynamic import DynamicModule
 from modelopt.torch.utils.distributed import ParallelState
 
-from ..algorithms import AutoQuantizeSearcher
+from ..algorithms import AutoQuantizeGradientSearcher
 from ..conversion import register
 from ..nn import QuantInputBase, QuantModule, QuantModuleRegistry, TensorQuantizer
 from ..nn.modules.quant_linear import _QuantLinear
@@ -745,7 +745,7 @@ def _is_param_grad_enabled_for_auto_quantize(pname, model):
     return "embed" in pname
 
 
-AutoQuantizeSearcher.register_custom_support(
+AutoQuantizeGradientSearcher.register_custom_support(
     _is_supported_hf_model,
     setup_model_for_gradient_checkpointing,
     _is_param_grad_enabled_for_auto_quantize,
diff --git a/tests/gpu/torch/export/test_export_weight.py b/tests/gpu/torch/export/test_export_weight_gpu.py
similarity index 100%
rename from tests/gpu/torch/export/test_export_weight.py
rename to tests/gpu/torch/export/test_export_weight_gpu.py
diff --git a/tests/unit/torch/quantization/plugins/test_huggingface.py b/tests/unit/torch/quantization/plugins/test_huggingface.py
index ab59e663e..a3e72ffa7 100644
--- a/tests/unit/torch/quantization/plugins/test_huggingface.py
+++ b/tests/unit/torch/quantization/plugins/test_huggingface.py
@@ -15,6 +15,7 @@
 
 import os
 import warnings
+from contextlib import nullcontext
 
 import pytest
 import torch
@@ -136,28 +137,43 @@ def test_dbrx():
     assert torch.allclose(out_1[0], out_2[0])
 
 
-def test_autoquantize_huggingface():
+@pytest.mark.parametrize(
+    "method",
+    ["gradient", "kl_div"],
+)
+def test_autoquantize_huggingface(method):
     model = get_tiny_llama()
     input_ids = model.dummy_inputs["input_ids"]
 
+    def forward_step(model, batch):
+        return model(**batch) if method == "gradient" else model(**batch).logits
+
     warnings.filterwarnings(
         "error", message="AutoQuantize: Error enabling gradient checkpointing for huggingface model"
     )
 
-    with pytest.warns(
-        UserWarning,
-        match="AutoQuantize: Huggingface model detected - Enabling gradient checkpointing. ",
-    ):
+    # Gradient checkpointing warning should only appear for gradient-based method
+    context = (
+        pytest.warns(
+            UserWarning,
+            match="AutoQuantize: Huggingface model detected - Enabling gradient checkpointing. ",
+        )
+        if method == "gradient"
+        else nullcontext()
+    )
+
+    with context:
         best_model, search_history = mtq.auto_quantize(
             model,
             constraints={"effective_bits": 11.0},
             quantization_formats=[mtq.INT8_DEFAULT_CFG],
             data_loader=[{"input_ids": input_ids, "labels": input_ids} for _ in range(2)],
-            forward_step=lambda model, batch: model(**batch),
+            forward_step=forward_step,
             loss_func=lambda output, data: output.loss,
             num_calib_steps=2,
             num_score_steps=2,
             verbose=True,
+            method=method,
         )
 
 
diff --git a/tests/unit/torch/quantization/test_autoquant.py b/tests/unit/torch/quantization/test_autoquant.py
index b08d7fa5e..1a5cfee32 100644
--- a/tests/unit/torch/quantization/test_autoquant.py
+++ b/tests/unit/torch/quantization/test_autoquant.py
@@ -92,7 +92,7 @@ def test_quant_recipe_hparam():
     ]
     hparam = QuantRecipeHparam(
         search_recipes,
-        nn_modules=[model_test],
+        quant_modules=[model_test],
     )
     model_test._register_hparam("quant_recipe", hparam)
     assert model_test.quant_recipe == QuantRecipe(mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG)
@@ -133,7 +133,11 @@ def test_quant_recipe_hparam():
         ([None, mtq.INT8_SMOOTHQUANT_CFG], 8.0, 11.0),
     ],
 )
-def test_auto_quantize(model_cls, search_formats, min_bits, search_bits):
+@pytest.mark.parametrize(
+    "method",
+    ["gradient", "kl_div"],
+)
+def test_auto_quantize(model_cls, search_formats, min_bits, search_bits, method):
     model = model_cls()
 
     def loss_func(output):
@@ -149,6 +153,7 @@ def loss_func(output):
         num_calib_steps=2,
         num_score_steps=2,
         verbose=True,
+        method=method,
     )
     assert isinstance(search_history, dict)
     assert search_history["best"]["is_satisfied"]
@@ -178,7 +183,7 @@ def loss_func(output):
     assert torch.allclose(output_ref, output_test)
 
 
-def test_auto_quantize_disable():
+def test_auto_quantize_disable_layers():
     model = TransformerBlock()
 
     def loss_func(output):
@@ -337,3 +342,58 @@ def test_estimate_quant_compression():
 
     fp8_affine_kv_cfg = mtq.config.QuantizeConfig(**mtq.FP8_AFFINE_KV_CFG)
     assert estimate_quant_compression(fp8_affine_kv_cfg) == 0.5
+
+
+@pytest.mark.parametrize("method", ["gradient", "kl_div"])
+def test_auto_quantize_checkpoint_resume(method, tmp_path, capsys):
+    """Test that checkpoint can be used to resume an interrupted search."""
+    model = SimpleLinear()
+    checkpoint_path = str(tmp_path / "autoquant_resume_checkpoint.pth")
+
+    # First run: save checkpoint
+    model_1, state_dict_1 = mtq.auto_quantize(
+        model,
+        constraints={"effective_bits": 6.0},
+        quantization_formats=[mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG, mtq.INT8_DEFAULT_CFG],
+        data_loader=[model.get_input() for _ in range(2)],
+        forward_step=lambda model, batch: model(batch),
+        loss_func=lambda output, data: output.sum(),
+        num_calib_steps=2,
+        num_score_steps=2,
+        verbose=True,
+        method=method,
+        checkpoint=checkpoint_path,
+    )
+
+    # Clear captured output from first run
+    capsys.readouterr()
+
+    # Second run: resume with same constraint should produce same results
+    model_2 = SimpleLinear()
+    model_2, state_dict_2 = mtq.auto_quantize(
+        model_2,
+        constraints={"effective_bits": 6.0},  # Same constraint
+        quantization_formats=[mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG, mtq.INT8_DEFAULT_CFG],
+        data_loader=[model_2.get_input() for _ in range(2)],
+        forward_step=lambda model, batch: model(batch),
+        loss_func=lambda output, data: output.sum(),
+        num_calib_steps=2,
+        num_score_steps=2,
+        verbose=True,
+        method=method,
+        checkpoint=checkpoint_path,
+    )
+
+    # Verify the restore message was printed on second run
+    captured = capsys.readouterr()
+    assert "Restored from checkpoint, skipping scoring" in captured.out, (
+        "Expected restore message when resuming from checkpoint"
+    )
+
+    # Results should be identical when using same constraint
+    assert state_dict_1["candidate_stats"] == state_dict_2["candidate_stats"]
+    assert state_dict_1["best"]["recipe"] == state_dict_2["best"]["recipe"]
+    assert (
+        pytest.approx(state_dict_1["best"]["constraints"]["effective_bits"])
+        == state_dict_2["best"]["constraints"]["effective_bits"]
+    )

From 671bbbb69be999e8e32d20eb2c68a9d5f04d94f1 Mon Sep 17 00:00:00 2001
From: inisis <desmond.yao@buaa.edu.cn>
Date: Wed, 26 Nov 2025 14:24:37 +0800
Subject: [PATCH 2/8] feat: add onnxslim support

Signed-off-by: inisis <desmond.yao@buaa.edu.cn>
---
 CHANGELOG.rst                          |  1 +
 modelopt/onnx/quantization/quantize.py | 13 +++----------
 setup.py                               |  2 +-
 tests/gpu/onnx/test_simplify.py        |  8 ++++----
 4 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index beb01abf0..60c7ec5ca 100755
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -28,6 +28,7 @@ Model Optimizer Changelog (Linux)
 **Misc**
 
 - Bump minimum recommended transformers version to 4.53.
+- Replace ONNX simplification package from 'onnxsim' to 'onnxslim'.
 
 0.39 (2025-11-11)
 ^^^^^^^^^^^^^^^^^
diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py
index 800124646..00b4f5d75 100755
--- a/modelopt/onnx/quantization/quantize.py
+++ b/modelopt/onnx/quantization/quantize.py
@@ -39,6 +39,7 @@
 from typing import Any
 
 import onnx
+import onnxslim
 import onnx.onnx_cpp2py_export.checker as C
 import onnx_graphsurgeon as gs
 
@@ -133,16 +134,8 @@ def _preprocess_onnx(
     if simplify:
         logger.info("Attempting to simplify model")
         try:
-            import onnxsim
-        except ModuleNotFoundError as e:
-            logger.warning(
-                "onnxsim is not installed. Please install it with 'pip install onnxsim'."
-            )
-            raise e
-
-        try:
-            model_simp, check = onnxsim.simplify(onnx_model)
-            if check:
+            model_simp = onnxslim.slim(onnx_model, skip_fusion_patterns=["FusionGemm"])
+            if model_simp:
                 onnx_model = model_simp
                 onnx_path = os.path.join(output_dir, f"{model_name}_simp.onnx")
                 save_onnx(onnx_model, onnx_path, use_external_data_format)
diff --git a/setup.py b/setup.py
index 85b79e729..7befe9a47 100644
--- a/setup.py
+++ b/setup.py
@@ -52,7 +52,7 @@
         "onnxruntime-gpu~=1.22.0 ; platform_machine != 'aarch64' and platform_system != 'Darwin' and platform_system != 'Windows'",  # noqa: E501
         "onnxruntime-directml==1.20.0; platform_system == 'Windows'",
         "onnxscript",  # For autocast opset conversion and test_onnx_dynamo_export unit test
-        "onnxsim ; python_version < '3.12' and platform_machine != 'aarch64'",
+        "onnxslim>=0.1.75,
         "polygraphy>=0.49.22",
     ],
     "hf": [
diff --git a/tests/gpu/onnx/test_simplify.py b/tests/gpu/onnx/test_simplify.py
index 3b6acccb6..5ca8449b3 100644
--- a/tests/gpu/onnx/test_simplify.py
+++ b/tests/gpu/onnx/test_simplify.py
@@ -57,14 +57,14 @@ def test_onnx_simplification(tmp_path):
         assert os.path.isfile(output_onnx_path), "Quantized ONNX was not found!"
 
         # Load the simplified model and check that the model doesn't contain Identity nodes,
-        #   only 3 layers (Conv->BN->Relu).
+        #   only 2 layers (Conv->Relu).
         graph = gs.import_onnx(onnx.load(simplified_onnx_path))
         identity_nodes = [n for n in graph.nodes if n.op == "Identity"]
         assert not identity_nodes, "Simplified ONNX model contains Identity nodes but it shouldn't."
-        assert len(graph.nodes) == 3, (
-            f"Number of nodes doesn't match the expected: {len(graph.nodes)} vs 3."
+        assert len(graph.nodes) == 2, (
+            f"Number of nodes doesn't match the expected: {len(graph.nodes)} vs 2."
         )
-        assert all(n.op in ["Conv", "BatchNormalization", "Relu"] for n in graph.nodes), (
+        assert all(n.op in ["Conv", "Relu"] for n in graph.nodes), (
             "Graph contains more ops than expected."
         )
 

From 0bf5b06759fdb3d65d568db16f446ca02e97198e Mon Sep 17 00:00:00 2001
From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Date: Wed, 26 Nov 2025 16:23:53 +0530
Subject: [PATCH 3/8] Update CHANGELOG.rst

Signed-off-by: inisis <desmond.yao@buaa.edu.cn>
---
 CHANGELOG.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 60c7ec5ca..f6a323441 100755
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -28,7 +28,7 @@ Model Optimizer Changelog (Linux)
 **Misc**
 
 - Bump minimum recommended transformers version to 4.53.
-- Replace ONNX simplification package from 'onnxsim' to 'onnxslim'.
+- Replace ONNX simplification package from ``onnxsim`` to ``onnxslim``.
 
 0.39 (2025-11-11)
 ^^^^^^^^^^^^^^^^^

From ff3ffc83aca933a0d207d420e0f13bbebb7316b1 Mon Sep 17 00:00:00 2001
From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Date: Wed, 26 Nov 2025 16:25:25 +0530
Subject: [PATCH 4/8] Update tox.ini

Signed-off-by: inisis <desmond.yao@buaa.edu.cn>
---
 tox.ini | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/tox.ini b/tox.ini
index ff73b3b2b..1c1afbe3a 100644
--- a/tox.ini
+++ b/tox.ini
@@ -18,9 +18,6 @@ deps =
     torch28: torchvision~=0.23.0
     torch29: torchvision~=0.24.0
 
-    # Build onnxsim from sdists for Python 3.12 until http://github.com/daquexian/onnx-simplifier/pull/353
-    py312: onnxsim
-
     # Install megatron-core for special unit tests
     megatron-core
 
@@ -42,9 +39,6 @@ deps =
     # Make sure torch 2.9 is used
     torchvision~=0.24.0
 
-    # Build onnxsim from sdists for Python 3.12 until http://github.com/daquexian/onnx-simplifier/pull/353
-    py312: onnxsim
-
     # ONNX unit tests heavily rely on torch / torchvision
     onnx: .[onnx,dev-test]
     onnx: torchvision
@@ -80,9 +74,6 @@ commands_pre =
     # Install Eagle-3 test dependencies
     pip install tiktoken blobfile sentencepiece
 
-    # Build onnxsim from sdists for Python 3.12 until http://github.com/daquexian/onnx-simplifier/pull/353
-    py312: pip install onnxsim
-
     # NOTE: User is expected to have correct torch-cuda version pre-installed if using --current-env
     #   to avoid possible CUDA version mismatch
     pip install -e .[all,dev-test]

From 0e07446fc3c29cf2e29e68d0bf413be6ad1f449b Mon Sep 17 00:00:00 2001
From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Date: Wed, 26 Nov 2025 16:26:10 +0530
Subject: [PATCH 5/8] Update tests.yml

Signed-off-by: inisis <desmond.yao@buaa.edu.cn>
---
 .gitlab/tests.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.gitlab/tests.yml b/.gitlab/tests.yml
index 1012bcb7f..1d699896e 100644
--- a/.gitlab/tests.yml
+++ b/.gitlab/tests.yml
@@ -19,8 +19,6 @@ unit:
     TRANSFORMERS: latest
   image: python:3.$PYTHON
   before_script:
-    # Install cmake to build onnxsim from sdists for Python 3.12 until http://github.com/daquexian/onnx-simplifier/pull/353
-    - if [ "$PYTHON" = "12" ]; then apt-get update && apt-get install -y cmake; fi
     - pip install tox
   script:
     - tox -e py3$PYTHON-torch$TORCH-tf_$TRANSFORMERS-unit

From ba5a125433cff5a2bbecf1042e6fbc54f7f17534 Mon Sep 17 00:00:00 2001
From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Date: Wed, 26 Nov 2025 16:30:13 +0530
Subject: [PATCH 6/8] Update setup.py

Signed-off-by: inisis <desmond.yao@buaa.edu.cn>
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 7befe9a47..11bf5b261 100644
--- a/setup.py
+++ b/setup.py
@@ -52,7 +52,7 @@
         "onnxruntime-gpu~=1.22.0 ; platform_machine != 'aarch64' and platform_system != 'Darwin' and platform_system != 'Windows'",  # noqa: E501
         "onnxruntime-directml==1.20.0; platform_system == 'Windows'",
         "onnxscript",  # For autocast opset conversion and test_onnx_dynamo_export unit test
-        "onnxslim>=0.1.75,
+        "onnxslim>=0.1.75",
         "polygraphy>=0.49.22",
     ],
     "hf": [

From f2d3412c8522e3dc03fcbdcc78f63e6b731977d4 Mon Sep 17 00:00:00 2001
From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Date: Wed, 26 Nov 2025 16:46:33 +0530
Subject: [PATCH 7/8] Update quantize.py

Signed-off-by: inisis <desmond.yao@buaa.edu.cn>
---
 modelopt/onnx/quantization/quantize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py
index 00b4f5d75..96ee406c7 100755
--- a/modelopt/onnx/quantization/quantize.py
+++ b/modelopt/onnx/quantization/quantize.py
@@ -39,9 +39,9 @@
 from typing import Any
 
 import onnx
-import onnxslim
 import onnx.onnx_cpp2py_export.checker as C
 import onnx_graphsurgeon as gs
+import onnxslim
 
 from modelopt.onnx.logging_config import configure_logging, logger
 from modelopt.onnx.op_types import is_data_dependent_shape_op

From 18c27dd56fa8869322c3a95a8adc5c00306a5e25 Mon Sep 17 00:00:00 2001
From: inisis <desmond.yao@buaa.edu.cn>
Date: Wed, 26 Nov 2025 21:04:55 +0800
Subject: [PATCH 8/8] pin minimal onnxslim version to 0.1.76

Signed-off-by: inisis <desmond.yao@buaa.edu.cn>
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 11bf5b261..158c91e40 100644
--- a/setup.py
+++ b/setup.py
@@ -52,7 +52,7 @@
         "onnxruntime-gpu~=1.22.0 ; platform_machine != 'aarch64' and platform_system != 'Darwin' and platform_system != 'Windows'",  # noqa: E501
         "onnxruntime-directml==1.20.0; platform_system == 'Windows'",
         "onnxscript",  # For autocast opset conversion and test_onnx_dynamo_export unit test
-        "onnxslim>=0.1.75",
+        "onnxslim>=0.1.76",
         "polygraphy>=0.49.22",
     ],
     "hf": [