diff --git a/setup.py b/setup.py index 05e1ad0c1..79325409c 100644 --- a/setup.py +++ b/setup.py @@ -144,8 +144,7 @@ def localversion_func(version: ScmVersion) -> str: if BUILD_TYPE == "release" else "compressed-tensors>=0.12.3a2" ), - # TODO: replace it with the release version - ("auto_round @ git+https://github.com/intel/auto-round.git@llmc"), + ("auto-round==0.9.1"), ], extras_require={ "dev": [ diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index 2480751a9..de2fa31e5 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -107,6 +107,7 @@ class AutoRoundModifier(Modifier, QuantizationMixin): # AutoRound modifier arguments iters: int = 200 enable_torch_compile: bool = True + batch_size: int = 8 # private variables _module_names: Dict[torch.nn.Module, str] = PrivateAttr(default_factory=dict) @@ -223,6 +224,7 @@ def apply_autoround(self, state, subgraph): scheme=ar_quant_scheme, iters=self.iters, enable_torch_compile=self.enable_torch_compile, + batch_size=self.batch_size, ) # TODO: configure layer-wise config based on self.resolved_config ar.configure_layer_config(enable_gguf_official_mixed=False) @@ -236,7 +238,7 @@ def apply_autoround(self, state, subgraph): block=decoding_layer, inputs=cur_inputs, q_input=self._q_input, - device=device, + device=str(device), # Leave offload for LLMC auto_offload=False, )