
Commit f6aa122

[CI Sprint] Quantization CI Cleanup (vllm-project#24130)
Signed-off-by: Alex Yun <[email protected]>
1 parent 184b12f commit f6aa122

10 files changed, +32 -26 lines changed


tests/quantization/test_compressed_tensors.py

Lines changed: 2 additions & 2 deletions
@@ -141,7 +141,7 @@ def zp_valid(zp: torch.Tensor | None):
         "neuralmagic/Llama-3.2-1B-quantized.w8a8",
     ],
 )
-@pytest.mark.parametrize("max_tokens", [8])
+@pytest.mark.parametrize("max_tokens", [4])
 @pytest.mark.parametrize("num_logprobs", [10])
 @pytest.mark.parametrize(
     "use_aiter", [True, False] if current_platform.is_rocm() else [False]
@@ -182,7 +182,7 @@ def test_compressed_tensors_w8a8_logprobs(
         example_prompts, max_tokens, num_logprobs
     )

-    with vllm_runner(model_path, dtype=dtype) as vllm_model:
+    with vllm_runner(model_path, dtype=dtype, enforce_eager=True) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy_logprobs(
             example_prompts, max_tokens, num_logprobs
         )

tests/quantization/test_cpu_offload.py

Lines changed: 8 additions & 8 deletions
@@ -19,8 +19,8 @@ def test_cpu_offload_fp8():
     # Test loading a quantized checkpoint
     compare_two_settings(
         "neuralmagic/Qwen2-1.5B-Instruct-FP8",
-        [],
-        ["--cpu-offload-gb", "1"],
+        ["--enforce_eager"],
+        ["--enforce_eager", "--cpu-offload-gb", "1"],
         max_wait_seconds=480,
     )

@@ -35,8 +35,8 @@ def test_cpu_offload_gptq(monkeypatch):
     # Test GPTQ Marlin
     compare_two_settings(
         "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
-        [],
-        ["--cpu-offload-gb", "1"],
+        ["--enforce_eager"],
+        ["--enforce_eager", "--cpu-offload-gb", "1"],
         max_wait_seconds=480,
     )

@@ -51,8 +51,8 @@ def test_cpu_offload_awq(monkeypatch):
     # Test AWQ Marlin
     compare_two_settings(
         "Qwen/Qwen2-1.5B-Instruct-AWQ",
-        [],
-        ["--cpu-offload-gb", "1"],
+        ["--enforce_eager"],
+        ["--enforce_eager", "--cpu-offload-gb", "1"],
         max_wait_seconds=480,
     )

@@ -67,7 +67,7 @@ def test_cpu_offload_compressed_tensors(monkeypatch):
     # Test wNa16
     compare_two_settings(
         "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
-        [],
-        ["--cpu-offload-gb", "1"],
+        ["--enforce_eager"],
+        ["--enforce_eager", "--cpu-offload-gb", "1"],
         max_wait_seconds=480,
     )
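
For context, the four hunks above all follow one harness pattern. A minimal sketch, assuming the vLLM source tree where compare_two_settings is the helper in tests/utils.py that launches an API server twice with the two CLI argument lists and checks the responses agree (model name and flags below are copied from the first hunk):

from tests.utils import compare_two_settings

def test_cpu_offload_fp8():
    # Baseline server: eager mode only. Second server: eager mode plus
    # 1 GiB of weights offloaded to CPU. Completions from both must match.
    compare_two_settings(
        "neuralmagic/Qwen2-1.5B-Instruct-FP8",
        ["--enforce_eager"],
        ["--enforce_eager", "--cpu-offload-gb", "1"],
        max_wait_seconds=480,  # generous budget for model download/startup in CI
    )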

tests/quantization/test_experts_int8.py

Lines changed: 4 additions & 2 deletions
@@ -21,7 +21,7 @@
 )
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_tokens", [10])
+@pytest.mark.parametrize("max_tokens", [4])
 def test_model_experts_int8_startup(
     hf_runner,
     vllm_runner,
@@ -33,5 +33,7 @@ def test_model_experts_int8_startup(
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
     model_info.check_transformers_version(on_fail="skip")

-    with vllm_runner(model, dtype=dtype, quantization="experts_int8") as vllm_model:
+    with vllm_runner(
+        model, dtype=dtype, enforce_eager=True, quantization="experts_int8"
+    ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)

tests/quantization/test_fp8.py

Lines changed: 8 additions & 5 deletions
@@ -45,10 +45,10 @@ def test_model_load_and_run(
     if force_marlin:
         monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")

-    with vllm_runner(model_id) as llm:
+    with vllm_runner(model_id, enforce_eager=True) as llm:
         # note: this does not test accuracy, just that we can run through
         # see lm-eval tests for accuracy
-        outputs = llm.generate_greedy(["Hello my name is"], max_tokens=10)
+        outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4)
         print(outputs[0][1])


@@ -85,7 +85,7 @@ def test_kv_cache_model_load_and_run(

     # `LLM.apply_model` requires pickling a function.
     monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
-    with vllm_runner(model_id, kv_cache_dtype="fp8") as llm:
+    with vllm_runner(model_id, kv_cache_dtype="fp8", enforce_eager=True) as llm:

         def check_model(model):
             attn = model.model.layers[0].self_attn.attn
@@ -112,7 +112,7 @@ def check_model(model):

         # note: this does not test accuracy, just that we can run through
         # see lm-eval tests for accuracy
-        outputs = llm.generate_greedy(["Hello my name is"], max_tokens=10)
+        outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4)
         print(outputs[0][1])


@@ -142,7 +142,10 @@ def test_load_fp16_model(
         monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")

     with vllm_runner(
-        "facebook/opt-125m", quantization="fp8", kv_cache_dtype=kv_cache_dtype
+        "facebook/opt-125m",
+        quantization="fp8",
+        enforce_eager=True,
+        kv_cache_dtype=kv_cache_dtype,
     ) as llm:

         def check_model(model):
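
The in-process tests above share one shape as well. A minimal sketch, assuming the vllm_runner pytest fixture from the repo's tests/conftest.py, which wraps vllm.LLM in a context manager; arguments mirror the hunks above, where enforce_eager=True skips CUDA-graph capture so the smoke test starts faster:

def test_fp8_smoke(vllm_runner):
    with vllm_runner(
        "facebook/opt-125m",
        quantization="fp8",
        enforce_eager=True,
        kv_cache_dtype="fp8",  # one of the parametrized kv-cache dtypes above
    ) as llm:
        # Not an accuracy check: just verify a short greedy decode runs.
        outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4)
        assert outputs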

tests/quantization/test_ipex_quant.py

Lines changed: 2 additions & 2 deletions
@@ -26,7 +26,7 @@
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", DTYPE)
 def test_ipex_quant(vllm_runner, model, dtype):
-    with vllm_runner(model, dtype=dtype) as llm:
-        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+    with vllm_runner(model, dtype=dtype, enforce_eager=True) as llm:
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
     assert output
     print(output)

tests/quantization/test_lm_head.py

Lines changed: 1 addition & 1 deletion
@@ -49,4 +49,4 @@ def check_model(model):

     vllm_model.apply_model(check_model)

-    print(vllm_model.generate_greedy(["Hello my name is"], max_tokens=10)[0][1])
+    print(vllm_model.generate_greedy(["Hello my name is"], max_tokens=4)[0][1])

tests/quantization/test_modelopt.py

Lines changed: 1 addition & 1 deletion
@@ -88,6 +88,6 @@ def check_model(model):
     llm.apply_model(check_model)

     # Run a simple generation test to ensure the model works
-    output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
+    output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
     assert output
     print(f"ModelOpt FP8 output: {output}")

tests/quantization/test_ptpc_fp8.py

Lines changed: 2 additions & 1 deletion
@@ -38,6 +38,7 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:
             "facebook/opt-125m",
             dtype=dtype,
             quantization="ptpc_fp8",
+            enforce_eager=True,
             kv_cache_dtype=kv_cache_dtype,
         )
     except AssertionError as e:
@@ -65,5 +66,5 @@ def check_model(model):

     llm.apply_model(check_model)

-    output = llm.generate_greedy("Hello my name is", max_tokens=20)
+    output = llm.generate_greedy("Hello my name is", max_tokens=4)
     assert output

tests/quantization/test_register_quantization_config.py

Lines changed: 3 additions & 3 deletions
@@ -23,8 +23,8 @@
     get_quantization_config,
     register_quantization_config,
 )
-from vllm.model_executor.layers.quantization.base_config import (  # noqa: E501
-    QuantizationConfig,
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig,  # noqa: E501
 )


@@ -142,5 +142,5 @@ def check_model(model):

     llm.apply_model(check_model)

-    output = llm.generate_greedy("Hello my name is", max_tokens=20)
+    output = llm.generate_greedy("Hello my name is", max_tokens=1)
     assert output

tests/quantization/test_torchao.py

Lines changed: 1 addition & 1 deletion
@@ -392,7 +392,7 @@ def get_weight_attrs(model):
     assert not has_int4_preshuffled_tensor

     assert weight_attrs == [False, 1, 0, True]
-    output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+    output = llm.generate_greedy(["The capital of France is"], max_tokens=4)

     assert output