Commit 69ac35f

add eval config for Qwen3-235B-A22B-Thinking-2507-FP8

Signed-off-by: Huamin Li <[email protected]>
1 parent 69f0640

File tree

5 files changed: +38 −6 lines changed
.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+model_name: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"
+tasks:
+- name: "mmlu_pro"
+  metrics:
+  - name: "exact_match,custom-extract"
+    value: 0.82
+limit: 250 # will run on 250 * 14 subjects = 3500 samples
+num_fewshot: 5
+enforce_eager: false
+max_model_len: 49152
+apply_chat_template: true
+fewshot_as_multiturn: true
+gen_kwargs: "temperature=0,top_p=1,top_k=0,max_gen_toks=6144,until=<|ENDANSWER|>"
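
The gen_kwargs value is a single comma-separated string of key=value decoding controls that lm-eval splits into keyword arguments for generation. Below is a minimal sketch of parsing a string of that shape, for illustration only; the helper name and the numeric coercion are assumptions, not lm-eval's actual parser.

def parse_gen_kwargs(raw: str) -> dict:
    """Split a comma-separated "key=value" string into a kwargs dict.

    Hypothetical helper for illustration; lm-eval ships its own parser.
    """
    kwargs = {}
    for pair in raw.split(","):
        key, _, value = pair.partition("=")
        try:
            kwargs[key] = int(value)        # e.g. top_k=0, max_gen_toks=6144
        except ValueError:
            try:
                kwargs[key] = float(value)  # e.g. temperature=0.7
            except ValueError:
                kwargs[key] = value         # e.g. until=<|ENDANSWER|>
    return kwargs

parse_gen_kwargs("temperature=0,top_p=1,top_k=0,max_gen_toks=6144,until=<|ENDANSWER|>")
# -> {'temperature': 0, 'top_p': 1, 'top_k': 0, 'max_gen_toks': 6144,
#    'until': '<|ENDANSWER|>'}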

.buildkite/lm-eval-harness/configs/models-large-h100.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.
.buildkite/lm-eval-harness/configs/models-large-hopper.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Qwen3-235B-A22B-Instruct-2507-FP8.yaml
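
This list file is just newline-separated YAML filenames that the harness resolves against its configs/ directory. A rough sketch of that resolution, assuming the real conftest plumbing (which is not part of this diff):

from pathlib import Path

def read_config_list(list_file: Path) -> list[Path]:
    """Hypothetical sketch: expand a models-*.txt list into config paths."""
    configs_dir = list_file.parent
    return [
        configs_dir / line.strip()
        for line in list_file.read_text().splitlines()
        if line.strip()  # skip blank lines
    ]

# read_config_list(Path(".buildkite/lm-eval-harness/configs/models-large-hopper.txt"))
# -> [PosixPath(".buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml")]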

.buildkite/lm-eval-harness/test_lm_eval_correctness.py

Lines changed: 9 additions & 3 deletions
@@ -21,10 +21,11 @@ def launch_lm_eval(eval_config, tp_size):
     max_model_len = eval_config.get("max_model_len", 4096)
     batch_size = eval_config.get("batch_size", "auto")
     backend = eval_config.get("backend", "vllm")
+    enforce_eager = eval_config.get("enforce_eager", "true")
     model_args = (
         f"pretrained={eval_config['model_name']},"
         f"tensor_parallel_size={tp_size},"
-        f"enforce_eager=true,"
+        f"enforce_eager={enforce_eager},"
         f"add_bos_token=true,"
         f"trust_remote_code={trust_remote_code},"
         f"max_model_len={max_model_len},"
@@ -37,8 +38,13 @@ def launch_lm_eval(eval_config, tp_size):
         limit=eval_config["limit"],
         # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed to help
         # text models. however, this is regressing measured strict-match for
-        # existing text models in CI, so only apply it for mm.
-        apply_chat_template=backend == "vllm-vlm",
+        # existing text models in CI, so only apply it for mm, or when explicitly set.
+        apply_chat_template=eval_config.get(
+            "apply_chat_template", backend == "vllm-vlm"
+        ),
+        fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
+        # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
+        gen_kwargs=eval_config.get("gen_kwargs"),
         batch_size=batch_size,
     )
     return results
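
For the new Qwen3 config, the eval_config.get(...) fallbacks above resolve as sketched below; values come from the YAML added in this commit, and any config that omits these keys keeps the old CI behavior:

# Sketch: how the new keys resolve for the Qwen3 config (values from the
# YAML in this commit); older configs without these keys are unaffected.
eval_config = {
    "model_name": "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8",
    "enforce_eager": False,
    "apply_chat_template": True,
    "fewshot_as_multiturn": True,
    "gen_kwargs": "temperature=0,top_p=1,top_k=0,max_gen_toks=6144,"
                  "until=<|ENDANSWER|>",
}
backend = eval_config.get("backend", "vllm")              # "vllm"
enforce_eager = eval_config.get("enforce_eager", "true")  # False (was hard-coded true)
apply_chat_template = eval_config.get(
    "apply_chat_template", backend == "vllm-vlm"
)                                                         # True here; False for old text configs
fewshot_as_multiturn = eval_config.get("fewshot_as_multiturn", False)  # True here
gen_kwargs = eval_config.get("gen_kwargs")                # the string above; None otherwise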

.buildkite/test-pipeline.yaml

Lines changed: 15 additions & 2 deletions
@@ -392,7 +392,7 @@ steps:
      --ignore=lora/test_deepseekv2_tp.py \
      --ignore=lora/test_gptoss.py \
      --ignore=lora/test_qwen3moe_tp.py
-
+
  parallelism: 4

- label: PyTorch Compilation Unit Tests # 15min
@@ -1115,7 +1115,7 @@ steps:
  - tests/weight_loading
  commands:
  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
-
+
- label: NixlConnector PD accuracy tests (Distributed) # 30min
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
@@ -1157,6 +1157,19 @@ steps:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

+##### H100 test #####
+- label: LM Eval Large Models (H100) # optional
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - export VLLM_USE_DEEP_GEMM=0
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
##### H200 test #####
- label: Distributed Tests (H200) # optional
  gpu: h200
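
The new H100 step passes or fails on the same criterion as the other lm-eval steps: each measured metric is compared against the expected value from the YAML config (value: 0.82 above). A sketch of that comparison; the relative-tolerance constant is an assumption, not taken from this diff:

import numpy as np

RTOL = 0.08  # assumed tolerance; the real harness defines its own constant

def check_metric(expected: float, measured: float) -> None:
    # Pass if the measured score is within RTOL of the expected score.
    assert np.isclose(expected, measured, rtol=RTOL), (
        f"measured {measured:.3f} outside tolerance of expected {expected:.3f}"
    )

check_metric(0.82, 0.80)  # within ~8% relative tolerance -> passes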
