diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
index 1b3a01172ee..c90a77b1ad2 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
@@ -88,6 +88,8 @@ Notes:
 
 ### Llama3 Quantization Recipes
 
+Here we provide several quantization recipes for Llama3 models. The relative accuracy loss of the quantized models should be less than 1%.
+
 #### Llama 3.1 8B MXFP8
 
 AutoRound tuning helps improve the accuracy, `iters` and `nsamples` is higher than default.
@@ -131,6 +133,8 @@ RTN (Round-to-Nearest) is enough to keep accuracy.
 CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.3-70B --dtype=mxfp8 --input_model=/models/Llama-3.3-70B-Instruct/ --output_model=Llama-3.3-70B-MXFP8
 ```
 
+> Note: Quantizing lm_head is acceptable within the accuracy threshold, but it is not enabled here in order to support vLLM inference.
+
 #### Llama 3.3 70B MXFP4 (Mixed with MXFP8, Target_bits=5.8)
 
 `Target_bits=5.8` is an empirical value.
@@ -147,6 +151,8 @@ RTN (Round-to-Nearest) is enough to keep accuracy.
 CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.1-70B --dtype=mxfp8 --input_model=/models/Llama-3.1-70B-Instruct/ --output_model=Llama-3.1-70B-MXFP8
 ```
 
+> Note: Quantizing lm_head is acceptable within the accuracy threshold, but it is not enabled here in order to support vLLM inference.
+
 #### Llama 3.1 70B NVFP4
 
 RTN (Round-to-Nearest) is enough to keep accuracy.
@@ -155,6 +161,8 @@ RTN (Round-to-Nearest) is enough to keep accuracy.
 CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_quant.sh --topology=Llama-3.1-70B --dtype=nvfp4 --input_model=/models/Llama-3.1-70B-Instruct/ --output_model=Llama-3.1-70B-NVFP4
 ```
 
+> Note: Quantizing lm_head is acceptable within the accuracy threshold, but it is not enabled here in order to support vLLM inference.
+
 #### Llama 3.1 70B uNVFP4
 
 RTN (Round-to-Nearest) is enough to keep accuracy.
@@ -186,27 +194,27 @@ For convenience, we provide a benchmark script that automatically handles GPU de
 
 1. **Llama 3.1 8B MXFP8** (1 GPU):
 ```bash
-CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path=Llama-3.1-8B-MXFP8
+CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path=Llama-3.1-8B-MXFP8 --gpu_memory_utilization=0.8
 ```
 
 2. **Llama 3.1 8B MXFP4 Mixed** (1 GPU):
 ```bash
-CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path=Llama-3.1-8B-MXFP4-MXFP8
+CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path=Llama-3.1-8B-MXFP4-MXFP8 --gpu_memory_utilization=0.6
 ```
 
-3. **Llama 3.3 70B MXFP8** (4 GPU):
+3. **Llama 3.3 70B MXFP8** (2 GPU):
 ```bash
-CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --model_path=Llama-3.3-70B-MXFP8
+CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh --model_path=Llama-3.3-70B-MXFP8 --gpu_memory_utilization=0.8
 ```
 
-4. **Llama 3.3 70B MXFP4 Mixed** (4 GPU):
+4. **Llama 3.3 70B MXFP4 Mixed** (2 GPU):
 ```bash
-CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --model_path=Llama-3.3-70B-MXFP4-MXFP8
+CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh --model_path=Llama-3.3-70B-MXFP4-MXFP8 --gpu_memory_utilization=0.6
 ```
 
-5. **Llama 3.1 70B MXFP8** (4 GPU):
+5. **Llama 3.1 70B MXFP8** (2 GPU):
 ```bash
-CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --model_path=Llama-3.1-70B-MXFP8
+CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh --model_path=Llama-3.1-70B-MXFP8 --gpu_memory_utilization=0.8
 ```
 
 The script automatically:
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh
index 87b635be52f..02bbeaba8af 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh
@@ -4,7 +4,8 @@
 
 # Parse command line arguments
 TASKS="piqa,hellaswag,mmlu,gsm8k"
-BATCH_SIZE=8
+BATCH_SIZE=64
+GPU_MEMORY_UTILIZATION=0.8
 
 while [[ $# -gt 0 ]]; do
     case $1 in
@@ -20,6 +21,10 @@ while [[ $# -gt 0 ]]; do
             BATCH_SIZE="${1#*=}"
             shift
             ;;
+        --gpu_memory_utilization=*)
+            GPU_MEMORY_UTILIZATION="${1#*=}"
+            shift
+            ;;
         *)
             echo "Unknown parameter: $1"
             exit 1
@@ -48,6 +53,7 @@ echo " Model Path: $MODEL_PATH"
 echo " Tasks: $TASKS"
 echo " Batch Size: $BATCH_SIZE"
 echo " Tensor Parallel Size: $TENSOR_PARALLEL_SIZE"
+echo " GPU Memory Utilization: $GPU_MEMORY_UTILIZATION"
 echo " CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
 
 # Check if the model exists
@@ -68,11 +74,11 @@ run_evaluation() {
     echo "Running evaluation for tasks: $tasks (add_bos_token=$add_bos_token)"
 
     # Print the command being executed
-    local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,data_parallel_size=1 --tasks $tasks --batch_size $BATCH_SIZE"
+    local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192 --tasks $tasks --batch_size $BATCH_SIZE"
     echo "Executing command: $cmd"
 
     lm_eval --model vllm \
-        --model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,data_parallel_size=1 \
+        --model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192 \
         --tasks $tasks \
         --batch_size $BATCH_SIZE
 
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh
index d50deaf6b3c..91e56f182e7 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh
@@ -95,13 +95,12 @@ case "$TOPOLOGY" in
         case "$DTYPE" in
             "mxfp8")
                 echo "Running Llama 3.3 70B MXFP8 quantization..."
-                CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype MXFP8 --quant_lm_head --iters 0 --export_path \"$OUTPUT_MODEL\""
+                CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype MXFP8 --iters 0 --export_path \"$OUTPUT_MODEL\""
                 echo "Executing command: $CMD"
                 python quantize.py \
                     --model_name_or_path "$INPUT_MODEL" \
                     $COMMON_ARGS \
                     --dtype MXFP8 \
-                    --quant_lm_head \
                     --iters 0 \
                     --export_path "$OUTPUT_MODEL"
                 ;;
@@ -140,25 +139,23 @@ case "$TOPOLOGY" in
         case "$DTYPE" in
             "mxfp8")
                 echo "Running Llama 3.1 70B MXFP8 quantization..."
-                CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype MXFP8 --quant_lm_head --iters 0 --export_path \"$OUTPUT_MODEL\""
+                CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype MXFP8 --iters 0 --export_path \"$OUTPUT_MODEL\""
                 echo "Executing command: $CMD"
                 python quantize.py \
                     --model_name_or_path "$INPUT_MODEL" \
                     $COMMON_ARGS \
                     --dtype MXFP8 \
-                    --quant_lm_head \
                     --iters 0 \
                     --export_path "$OUTPUT_MODEL"
                 ;;
             "nvfp4")
                 echo "Running Llama 3.1 70B NVFP4 quantization..."
-                CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype NVFP4 --quant_lm_head --iters 0 --export_format llm_compressor --export_path \"$OUTPUT_MODEL\""
+                CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype NVFP4 --iters 0 --export_format llm_compressor --export_path \"$OUTPUT_MODEL\""
                 echo "Executing command: $CMD"
                 python quantize.py \
                     --model_name_or_path "$INPUT_MODEL" \
                     $COMMON_ARGS \
                     --dtype NVFP4 \
-                    --quant_lm_head \
                     --iters 0 \
                     --export_format llm_compressor \
                     --export_path "$OUTPUT_MODEL"