
Commit f6aa122

[CI Sprint] Quantization CI Cleanup (vllm-project#24130)
Signed-off-by: Alex Yun <[email protected]>
1 parent 184b12f commit f6aa122

10 files changed, +32 -26 lines changed


tests/quantization/test_compressed_tensors.py

Lines changed: 2 additions & 2 deletions
@@ -141,7 +141,7 @@ def zp_valid(zp: torch.Tensor | None):
         "neuralmagic/Llama-3.2-1B-quantized.w8a8",
     ],
 )
-@pytest.mark.parametrize("max_tokens", [8])
+@pytest.mark.parametrize("max_tokens", [4])
 @pytest.mark.parametrize("num_logprobs", [10])
 @pytest.mark.parametrize(
     "use_aiter", [True, False] if current_platform.is_rocm() else [False]
@@ -182,7 +182,7 @@ def test_compressed_tensors_w8a8_logprobs(
         example_prompts, max_tokens, num_logprobs
     )

-    with vllm_runner(model_path, dtype=dtype) as vllm_model:
+    with vllm_runner(model_path, dtype=dtype, enforce_eager=True) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy_logprobs(
             example_prompts, max_tokens, num_logprobs
         )

tests/quantization/test_cpu_offload.py

Lines changed: 8 additions & 8 deletions
@@ -19,8 +19,8 @@ def test_cpu_offload_fp8():
     # Test loading a quantized checkpoint
     compare_two_settings(
         "neuralmagic/Qwen2-1.5B-Instruct-FP8",
-        [],
-        ["--cpu-offload-gb", "1"],
+        ["--enforce_eager"],
+        ["--enforce_eager", "--cpu-offload-gb", "1"],
         max_wait_seconds=480,
     )

@@ -35,8 +35,8 @@ def test_cpu_offload_gptq(monkeypatch):
     # Test GPTQ Marlin
     compare_two_settings(
         "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
-        [],
-        ["--cpu-offload-gb", "1"],
+        ["--enforce_eager"],
+        ["--enforce_eager", "--cpu-offload-gb", "1"],
         max_wait_seconds=480,
     )

@@ -51,8 +51,8 @@ def test_cpu_offload_awq(monkeypatch):
     # Test AWQ Marlin
     compare_two_settings(
         "Qwen/Qwen2-1.5B-Instruct-AWQ",
-        [],
-        ["--cpu-offload-gb", "1"],
+        ["--enforce_eager"],
+        ["--enforce_eager", "--cpu-offload-gb", "1"],
         max_wait_seconds=480,
     )

@@ -67,7 +67,7 @@ def test_cpu_offload_compressed_tensors(monkeypatch):
     # Test wNa16
     compare_two_settings(
         "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
-        [],
-        ["--cpu-offload-gb", "1"],
+        ["--enforce_eager"],
+        ["--enforce_eager", "--cpu-offload-gb", "1"],
         max_wait_seconds=480,
     )
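
For context, the four hunks above all follow one harness pattern. A minimal sketch, assuming the vLLM source tree where compare_two_settings is the helper in tests/utils.py that launches an API server twice with the two CLI argument lists and checks the responses agree (model name and flags below are copied from the first hunk):

from tests.utils import compare_two_settings

def test_cpu_offload_fp8():
    # Baseline server: eager mode only. Second server: eager mode plus
    # 1 GiB of weights offloaded to CPU. Completions from both must match.
    compare_two_settings(
        "neuralmagic/Qwen2-1.5B-Instruct-FP8",
        ["--enforce_eager"],
        ["--enforce_eager", "--cpu-offload-gb", "1"],
        max_wait_seconds=480,  # generous budget for model download/startup in CI
    )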

tests/quantization/test_experts_int8.py

Lines changed: 4 additions & 2 deletions
@@ -21,7 +21,7 @@
 )
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_tokens", [10])
+@pytest.mark.parametrize("max_tokens", [4])
 def test_model_experts_int8_startup(
     hf_runner,
     vllm_runner,
@@ -33,5 +33,7 @@ def test_model_experts_int8_startup(
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
     model_info.check_transformers_version(on_fail="skip")

-    with vllm_runner(model, dtype=dtype, quantization="experts_int8") as vllm_model:
+    with vllm_runner(
+        model, dtype=dtype, enforce_eager=True, quantization="experts_int8"
+    ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)

tests/quantization/test_fp8.py

Lines changed: 8 additions & 5 deletions
@@ -45,10 +45,10 @@ def test_model_load_and_run(
     if force_marlin:
         monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")

-    with vllm_runner(model_id) as llm:
+    with vllm_runner(model_id, enforce_eager=True) as llm:
         # note: this does not test accuracy, just that we can run through
         # see lm-eval tests for accuracy
-        outputs = llm.generate_greedy(["Hello my name is"], max_tokens=10)
+        outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4)
         print(outputs[0][1])


@@ -85,7 +85,7 @@ def test_kv_cache_model_load_and_run(

     # `LLM.apply_model` requires pickling a function.
     monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
-    with vllm_runner(model_id, kv_cache_dtype="fp8") as llm:
+    with vllm_runner(model_id, kv_cache_dtype="fp8", enforce_eager=True) as llm:

         def check_model(model):
             attn = model.model.layers[0].self_attn.attn
@@ -112,7 +112,7 @@ def check_model(model):

         # note: this does not test accuracy, just that we can run through
         # see lm-eval tests for accuracy
-        outputs = llm.generate_greedy(["Hello my name is"], max_tokens=10)
+        outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4)
         print(outputs[0][1])


@@ -142,7 +142,10 @@ def test_load_fp16_model(
         monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")

     with vllm_runner(
-        "facebook/opt-125m", quantization="fp8", kv_cache_dtype=kv_cache_dtype
+        "facebook/opt-125m",
+        quantization="fp8",
+        enforce_eager=True,
+        kv_cache_dtype=kv_cache_dtype,
     ) as llm:

         def check_model(model):
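
The in-process tests above share one shape as well. A minimal sketch, assuming the vllm_runner pytest fixture from the repo's tests/conftest.py, which wraps vllm.LLM in a context manager; arguments mirror the hunks above, where enforce_eager=True skips CUDA-graph capture so the smoke test starts faster:

def test_fp8_smoke(vllm_runner):
    with vllm_runner(
        "facebook/opt-125m",
        quantization="fp8",
        enforce_eager=True,
        kv_cache_dtype="fp8",  # one of the parametrized kv-cache dtypes above
    ) as llm:
        # Not an accuracy check: just verify a short greedy decode runs.
        outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4)
        assert outputs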

tests/quantization/test_ipex_quant.py

Lines changed: 2 additions & 2 deletions
@@ -26,7 +26,7 @@
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", DTYPE)
 def test_ipex_quant(vllm_runner, model, dtype):
-    with vllm_runner(model, dtype=dtype) as llm:
-        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+    with vllm_runner(model, dtype=dtype, enforce_eager=True) as llm:
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
     assert output
     print(output)

tests/quantization/test_lm_head.py

Lines changed: 1 addition & 1 deletion
@@ -49,4 +49,4 @@ def check_model(model):

     vllm_model.apply_model(check_model)

-    print(vllm_model.generate_greedy(["Hello my name is"], max_tokens=10)[0][1])
+    print(vllm_model.generate_greedy(["Hello my name is"], max_tokens=4)[0][1])

tests/quantization/test_modelopt.py

Lines changed: 1 addition & 1 deletion
@@ -88,6 +88,6 @@ def check_model(model):
     llm.apply_model(check_model)

     # Run a simple generation test to ensure the model works
-    output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
+    output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
     assert output
     print(f"ModelOpt FP8 output: {output}")

tests/quantization/test_ptpc_fp8.py

Lines changed: 2 additions & 1 deletion
@@ -38,6 +38,7 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:
             "facebook/opt-125m",
             dtype=dtype,
             quantization="ptpc_fp8",
+            enforce_eager=True,
             kv_cache_dtype=kv_cache_dtype,
         )
     except AssertionError as e:
@@ -65,5 +66,5 @@ def check_model(model):

     llm.apply_model(check_model)

-    output = llm.generate_greedy("Hello my name is", max_tokens=20)
+    output = llm.generate_greedy("Hello my name is", max_tokens=4)
     assert output

tests/quantization/test_register_quantization_config.py

Lines changed: 3 additions & 3 deletions
@@ -23,8 +23,8 @@
     get_quantization_config,
     register_quantization_config,
 )
-from vllm.model_executor.layers.quantization.base_config import (  # noqa: E501
-    QuantizationConfig,
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig,  # noqa: E501
 )


@@ -142,5 +142,5 @@ def check_model(model):

     llm.apply_model(check_model)

-    output = llm.generate_greedy("Hello my name is", max_tokens=20)
+    output = llm.generate_greedy("Hello my name is", max_tokens=1)
     assert output

tests/quantization/test_torchao.py

Lines changed: 1 addition & 1 deletion
@@ -392,7 +392,7 @@ def get_weight_attrs(model):
     assert not has_int4_preshuffled_tensor

     assert weight_attrs == [False, 1, 0, True]
-    output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+    output = llm.generate_greedy(["The capital of France is"], max_tokens=4)

     assert output