@@ -45,10 +45,10 @@ def test_model_load_and_run(
     if force_marlin:
         monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")
 
-    with vllm_runner(model_id) as llm:
+    with vllm_runner(model_id, enforce_eager=True) as llm:
         # note: this does not test accuracy, just that we can run through
         # see lm-eval tests for accuracy
-        outputs = llm.generate_greedy(["Hello my name is"], max_tokens=10)
+        outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4)
         print(outputs[0][1])
 
 
@@ -85,7 +85,7 @@ def test_kv_cache_model_load_and_run(
 
     # `LLM.apply_model` requires pickling a function.
     monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
-    with vllm_runner(model_id, kv_cache_dtype="fp8") as llm:
+    with vllm_runner(model_id, kv_cache_dtype="fp8", enforce_eager=True) as llm:
 
         def check_model(model):
             attn = model.model.layers[0].self_attn.attn
@@ -112,7 +112,7 @@ def check_model(model):
 
         # note: this does not test accuracy, just that we can run through
         # see lm-eval tests for accuracy
-        outputs = llm.generate_greedy(["Hello my name is"], max_tokens=10)
+        outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4)
         print(outputs[0][1])
 
 
@@ -142,7 +142,10 @@ def test_load_fp16_model(
         monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")
 
     with vllm_runner(
-        "facebook/opt-125m", quantization="fp8", kv_cache_dtype=kv_cache_dtype
+        "facebook/opt-125m",
+        quantization="fp8",
+        enforce_eager=True,
+        kv_cache_dtype=kv_cache_dtype,
     ) as llm:
 
         def check_model(model):