examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round
6 files changed: +11 -8 lines

@@ -4,7 +4,7 @@ This example provides an end-to-end workflow to quantize DeepSeek models to MXFP
 ```bash
 pip install neural-compressor-pt==3.7
 # auto-round
-pip install auto-round==0.9.2
+pip install auto-round==0.9.3
 # vLLM
 git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork
 VLLM_USE_PRECOMPILED=1 pip install --editable . -vvv
@@ -16,7 +16,7 @@ pip uninstall flash_attn
 ### Quantize Model
 - Export model path
 ```bash
-export MODEL=deepseek-ai/DeepSeek-R1
+export MODEL=unsloth/DeepSeek-R1-BF1
 ```

 - MXFP8

@@ -40,11 +40,12 @@ def get_model_and_tokenizer(model_name):
     fp32_model = AutoModelForCausalLM.from_pretrained(
         model_name,
         device_map="cpu",
-        trust_remote_code=True,
+        trust_remote_code=False,
+        dtype="auto",
     )
     tokenizer = AutoTokenizer.from_pretrained(
         model_name,
-        trust_remote_code=True,
+        trust_remote_code=False,
     )
     return fp32_model, tokenizer
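Net of this hunk, the loader reads as below. A minimal sketch of the post-change function, assuming only standard `transformers` boilerplate around the lines shown in the diff:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

def get_model_and_tokenizer(model_name):
    # trust_remote_code=False: load only modeling code that ships with
    # transformers, never Python pulled from the model repository.
    fp32_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="cpu",
        trust_remote_code=False,
        dtype="auto",  # keep the checkpoint's stored dtype instead of forcing fp32
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=False,
    )
    return fp32_model, tokenizer
```

Note that the variable keeps its `fp32_model` name even though `dtype="auto"` can now load the checkpoint in its native precision, for example bf16.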

@@ -68,6 +69,7 @@ def quant_model(args):
         fp_layers=config["fp_layers"],
         export_format=export_format,
         output_dir=output_dir,
+        reloading=False,
     )

     # quantizer execute
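The `# quantizer execute` context line marks the step that consumes this config. For orientation, a hedged sketch of that step under the neural-compressor 3.x PyTorch API; `prepare`/`convert` are assumed here, and the actual call site is outside this hunk:

```python
from neural_compressor.torch.quantization import prepare, convert

# Sketch only: apply the AutoRound config built above, then convert.
# Depending on the scheme, a calibration run over sample data may be
# required between these two calls.
model = prepare(model=fp32_model, quant_config=quant_config)
model = convert(model)
```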

@@ -114,7 +114,6 @@ lm_eval --model vllm \
     --tasks $TASK_NAME \
     --batch_size $BATCH_SIZE \
     --log_samples \
-    --limit 64 \
     --seed 42 \
     --output_path ${OUTPUT_DIR} \
     --show_config 2>&1 | tee ${OUTPUT_DIR}/log.txt

@@ -4,7 +4,7 @@ This example provides an end-to-end workflow to quantize Qwen models to MXFP4/MX
 ```bash
 pip install neural-compressor-pt==3.7
 # auto-round
-pip install auto-round==0.9.2
+pip install auto-round==0.9.3
 # vLLM
 git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork
 VLLM_USE_PRECOMPILED=1 pip install --editable . -vvv

@@ -62,11 +62,14 @@ def quant_model(args):
     quant_config = AutoRoundConfig(
         tokenizer=tokenizer,
         scheme=config["scheme"],
-        enable_torch_compile=args.enable_torch_compile,
+        enable_torch_compile=True,
         iters=config["iters"],
         fp_layers=config["fp_layers"],
         export_format=export_format,
+        disable_opt_rtn=True,
+        low_gpu_mem_usage=True,
         output_dir=output_dir,
+        reloading=False,
     )

     # quantizer execute
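Assembled from the context and added lines, the updated Qwen call site reads roughly as follows. The import path is an assumption (the INC 3.x one), and the comments on the new flags reflect their auto-round documentation rather than anything stated in this diff:

```python
from neural_compressor.torch.quantization import AutoRoundConfig

quant_config = AutoRoundConfig(
    tokenizer=tokenizer,
    scheme=config["scheme"],
    enable_torch_compile=True,   # hard-coded now; no longer read from args
    iters=config["iters"],
    fp_layers=config["fp_layers"],
    export_format=export_format,
    disable_opt_rtn=True,        # added: skip auto-round's optimized-RTN path
    low_gpu_mem_usage=True,      # added: lower peak GPU memory at some speed cost
    output_dir=output_dir,
    reloading=False,             # added in this PR; counterpart of the DeepSeek change
)
```

Hard-coding `enable_torch_compile=True` removes the CLI toggle, so runs that previously disabled compilation through `args.enable_torch_compile` will now compile regardless.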

@@ -114,7 +114,6 @@ lm_eval --model vllm \
     --tasks $TASK_NAME \
     --batch_size $BATCH_SIZE \
     --log_samples \
-    --limit 64 \
     --seed 42 \
     --output_path ${OUTPUT_DIR} \
     --show_config 2>&1 | tee ${OUTPUT_DIR}/log.txt