vllm-project · yiliu30 · Nov 21, 2025 · Nov 21, 2025 · Nov 24, 2025 · Nov 24, 2025
diff --git a/setup.py b/setup.py
@@ -144,8 +144,7 @@ def localversion_func(version: ScmVersion) -> str:
             if BUILD_TYPE == "release"
             else "compressed-tensors>=0.12.3a2"
         ),
-        # TODO: replace it with the release version
-        ("auto_round @ git+https://github.com/intel/auto-round.git@llmc"),
+        ("auto-round==0.9.1"),
     ],
     extras_require={
         "dev": [

diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py
@@ -107,6 +107,7 @@ class AutoRoundModifier(Modifier, QuantizationMixin):
     # AutoRound modifier arguments
     iters: int = 200
     enable_torch_compile: bool = True
+    batch_size: int = 8
 
     # private variables
     _module_names: Dict[torch.nn.Module, str] = PrivateAttr(default_factory=dict)
@@ -223,6 +224,7 @@ def apply_autoround(self, state, subgraph):
                 scheme=ar_quant_scheme,
                 iters=self.iters,
                 enable_torch_compile=self.enable_torch_compile,
+                batch_size=self.batch_size,
             )
             # TODO: configure layer-wise config based on self.resolved_config
             ar.configure_layer_config(enable_gguf_official_mixed=False)
@@ -236,7 +238,7 @@ def apply_autoround(self, state, subgraph):
                 block=decoding_layer,
                 inputs=cur_inputs,
                 q_input=self._q_input,
-                device=device,
+                device=str(device),
                 # Leave offload for LLMC
                 auto_offload=False,
             )