examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round
6 files changed: +11 -8 lines

@@ -4,7 +4,7 @@ This example provides an end-to-end workflow to quantize DeepSeek models to MXFP
 ```bash
 pip install neural-compressor-pt==3.7
 # auto-round
-pip install auto-round==0.9.2
+pip install auto-round==0.9.3
 # vLLM
 git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork
 VLLM_USE_PRECOMPILED=1 pip install --editable . -vvv
@@ -16,7 +16,7 @@ pip uninstall flash_attn
 ### Quantize Model
 - Export model path
 ```bash
-export MODEL=deepseek-ai/DeepSeek-R1
+export MODEL=unsloth/DeepSeek-R1-BF1
 ```

 - MXFP8

@@ -40,11 +40,12 @@ def get_model_and_tokenizer(model_name):
     fp32_model = AutoModelForCausalLM.from_pretrained(
         model_name,
         device_map="cpu",
-        trust_remote_code=True,
+        trust_remote_code=False,
+        dtype="auto",
     )
     tokenizer = AutoTokenizer.from_pretrained(
         model_name,
-        trust_remote_code=True,
+        trust_remote_code=False,
     )
     return fp32_model, tokenizer
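Net of this hunk, the loader reads as below. A minimal sketch of the post-change function, assuming only standard `transformers` boilerplate around the lines shown in the diff:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

def get_model_and_tokenizer(model_name):
    # trust_remote_code=False: load only modeling code that ships with
    # transformers, never Python pulled from the model repository.
    fp32_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="cpu",
        trust_remote_code=False,
        dtype="auto",  # keep the checkpoint's stored dtype instead of forcing fp32
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=False,
    )
    return fp32_model, tokenizer
```

Note that the variable keeps its `fp32_model` name even though `dtype="auto"` can now load the checkpoint in its native precision, for example bf16.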

@@ -68,6 +69,7 @@ def quant_model(args):
         fp_layers=config["fp_layers"],
         export_format=export_format,
         output_dir=output_dir,
+        reloading=False,
     )

     # quantizer execute
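The `# quantizer execute` context line marks the step that consumes this config. For orientation, a hedged sketch of that step under the neural-compressor 3.x PyTorch API; `prepare`/`convert` are assumed here, and the actual call site is outside this hunk:

```python
from neural_compressor.torch.quantization import prepare, convert

# Sketch only: apply the AutoRound config built above, then convert.
# Depending on the scheme, a calibration run over sample data may be
# required between these two calls.
model = prepare(model=fp32_model, quant_config=quant_config)
model = convert(model)
```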

@@ -114,7 +114,6 @@ lm_eval --model vllm \
     --tasks $TASK_NAME \
     --batch_size $BATCH_SIZE \
     --log_samples \
-    --limit 64 \
     --seed 42 \
     --output_path ${OUTPUT_DIR} \
     --show_config 2>&1 | tee ${OUTPUT_DIR}/log.txt

@@ -4,7 +4,7 @@ This example provides an end-to-end workflow to quantize Qwen models to MXFP4/MX
 ```bash
 pip install neural-compressor-pt==3.7
 # auto-round
-pip install auto-round==0.9.2
+pip install auto-round==0.9.3
 # vLLM
 git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork
 VLLM_USE_PRECOMPILED=1 pip install --editable . -vvv

@@ -62,11 +62,14 @@ def quant_model(args):
     quant_config = AutoRoundConfig(
         tokenizer=tokenizer,
         scheme=config["scheme"],
-        enable_torch_compile=args.enable_torch_compile,
+        enable_torch_compile=True,
         iters=config["iters"],
         fp_layers=config["fp_layers"],
         export_format=export_format,
+        disable_opt_rtn=True,
+        low_gpu_mem_usage=True,
         output_dir=output_dir,
+        reloading=False,
     )

     # quantizer execute
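Assembled from the context and added lines, the updated Qwen call site reads roughly as follows. The import path is an assumption (the INC 3.x one), and the comments on the new flags reflect their auto-round documentation rather than anything stated in this diff:

```python
from neural_compressor.torch.quantization import AutoRoundConfig

quant_config = AutoRoundConfig(
    tokenizer=tokenizer,
    scheme=config["scheme"],
    enable_torch_compile=True,   # hard-coded now; no longer read from args
    iters=config["iters"],
    fp_layers=config["fp_layers"],
    export_format=export_format,
    disable_opt_rtn=True,        # added: skip auto-round's optimized-RTN path
    low_gpu_mem_usage=True,      # added: lower peak GPU memory at some speed cost
    output_dir=output_dir,
    reloading=False,             # added in this PR; counterpart of the DeepSeek change
)
```

Hard-coding `enable_torch_compile=True` removes the CLI toggle, so runs that previously disabled compilation through `args.enable_torch_compile` will now compile regardless.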

@@ -114,7 +114,6 @@ lm_eval --model vllm \
     --tasks $TASK_NAME \
     --batch_size $BATCH_SIZE \
     --log_samples \
-    --limit 64 \
     --seed 42 \
     --output_path ${OUTPUT_DIR} \
     --show_config 2>&1 | tee ${OUTPUT_DIR}/log.txt