Skip to content

Commit e9ab033

Browse files
authored
Merge branch 'main' into fix/validate-tool-requests-29432
2 parents d9f63e9 + e94384b commit e9ab033

File tree

143 files changed

+2562
-1252
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

143 files changed

+2562
-1252
lines changed

.buildkite/test-amd.yaml

Lines changed: 5 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -61,8 +61,8 @@ steps:
6161
- pytest -v -s -m 'not cpu_test' multimodal
6262
- pytest -v -s utils_
6363

64-
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
65-
timeout_in_minutes: 20
64+
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
65+
timeout_in_minutes: 30
6666
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
6767
agent_pool: mi325_1
6868
grade: Blocking
@@ -73,6 +73,7 @@ steps:
7373
- tests/multimodal
7474
- tests/standalone_tests/lazy_imports.py
7575
- tests/tokenizers_
76+
- tests/tool_parsers
7677
- tests/transformers_utils
7778
- tests/config
7879
no_gpu: true
@@ -82,6 +83,7 @@ steps:
8283
- pytest -v -s test_outputs.py
8384
- pytest -v -s -m 'cpu_test' multimodal
8485
- pytest -v -s tokenizers_
86+
- pytest -v -s tool_parsers
8587
- pytest -v -s transformers_utils
8688
- pytest -v -s config
8789

@@ -759,19 +761,7 @@ steps:
759761
- vllm/
760762
- tests/tool_use
761763
commands:
762-
- pytest -v -s -m 'not cpu_test' tool_use
763-
764-
- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
765-
mirror_hardwares: [amdexperimental, amdproduction]
766-
agent_pool: mi325_1
767-
# grade: Blocking
768-
timeout_in_minutes: 10
769-
source_file_dependencies:
770-
- vllm/
771-
- tests/tool_use
772-
no_gpu: true
773-
commands:
774-
- pytest -v -s -m 'cpu_test' tool_use
764+
- pytest -v -s tool_use
775765

776766
##### models test #####
777767

.buildkite/test-pipeline.yaml

Lines changed: 5 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -57,15 +57,16 @@ steps:
5757
- pytest -v -s -m 'not cpu_test' multimodal
5858
- pytest -v -s utils_
5959

60-
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
61-
timeout_in_minutes: 20
60+
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
61+
timeout_in_minutes: 30
6262
source_file_dependencies:
6363
- vllm/
6464
- tests/test_inputs.py
6565
- tests/test_outputs.py
6666
- tests/multimodal
6767
- tests/standalone_tests/lazy_imports.py
6868
- tests/tokenizers_
69+
- tests/tool_parsers
6970
- tests/transformers_utils
7071
- tests/config
7172
no_gpu: true
@@ -75,6 +76,7 @@ steps:
7576
- pytest -v -s test_outputs.py
7677
- pytest -v -s -m 'cpu_test' multimodal
7778
- pytest -v -s tokenizers_
79+
- pytest -v -s tool_parsers
7880
- pytest -v -s transformers_utils
7981
- pytest -v -s config
8082

@@ -672,16 +674,7 @@ steps:
672674
- vllm/
673675
- tests/tool_use
674676
commands:
675-
- pytest -v -s -m 'not cpu_test' tool_use
676-
677-
- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
678-
timeout_in_minutes: 10
679-
source_file_dependencies:
680-
- vllm/
681-
- tests/tool_use
682-
no_gpu: true
683-
commands:
684-
- pytest -v -s -m 'cpu_test' tool_use
677+
- pytest -v -s tool_use
685678

686679
##### models test #####
687680

.buildkite/test_areas/misc.yaml

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -115,14 +115,15 @@ steps:
115115

116116
- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
117117
depends_on: ~
118-
timeout_in_minutes: 20
118+
timeout_in_minutes: 30
119119
source_file_dependencies:
120120
- vllm/
121121
- tests/test_inputs.py
122122
- tests/test_outputs.py
123123
- tests/multimodal
124124
- tests/standalone_tests/lazy_imports.py
125125
- tests/tokenizers_
126+
- tests/tool_parsers
126127
- tests/transformers_utils
127128
- tests/config
128129
no_gpu: true
@@ -132,6 +133,7 @@ steps:
132133
- pytest -v -s test_outputs.py
133134
- pytest -v -s -m 'cpu_test' multimodal
134135
- pytest -v -s tokenizers_
136+
- pytest -v -s tool_parsers
135137
- pytest -v -s transformers_utils
136138
- pytest -v -s config
137139

.buildkite/test_areas/tool_use.yaml

Lines changed: 1 addition & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -10,14 +10,4 @@ steps:
1010
- vllm/
1111
- tests/tool_use
1212
commands:
13-
- pytest -v -s -m 'not cpu_test' tool_use
14-
15-
- label: OpenAI-Compatible Tool Use (CPU)
16-
depends_on: ~
17-
timeout_in_minutes: 10
18-
source_file_dependencies:
19-
- vllm/
20-
- tests/tool_use
21-
no_gpu: true
22-
commands:
23-
- pytest -v -s -m 'cpu_test' tool_use
13+
- pytest -v -s tool_use

benchmarks/auto_tune/auto_tune.sh

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,11 @@ MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
1818
MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
1919
NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
2020
NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
21+
HOSTNAME=$(hostname)
22+
if [[ -z "$HOSTNAME" ]]; then
23+
echo "Error: Failed to determine hostname." >&2
24+
exit 1
25+
fi
2126

2227
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
2328
RESULT="$LOG_FOLDER/result.txt"
@@ -82,6 +87,7 @@ start_server() {
8287
"$MODEL"
8388
"--disable-log-requests"
8489
"--port" "8004"
90+
"--host" "$HOSTNAME"
8591
"--gpu-memory-utilization" "$gpu_memory_utilization"
8692
"--max-num-seqs" "$max_num_seqs"
8793
"--max-num-batched-tokens" "$max_num_batched_tokens"
@@ -113,7 +119,7 @@ start_server() {
113119
# since that we should always have permission to send signal to the server process.
114120
kill -0 $server_pid 2> /dev/null || break
115121

116-
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
122+
RESPONSE=$(curl -s -X GET "http://${HOSTNAME}:8004/health" -w "%{http_code}" -o /dev/stdout)
117123
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
118124
if [[ "$STATUS_CODE" -eq 200 ]]; then
119125
server_started=1
@@ -173,6 +179,7 @@ run_benchmark() {
173179
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
174180
--num-prompts 1000 \
175181
--random-prefix-len $prefix_len \
182+
--host "$HOSTNAME" \
176183
--port 8004 &> "$bm_log"
177184
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
178185
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
@@ -188,7 +195,7 @@ run_benchmark() {
188195
request_rate=$((${throughput%.*} + 1))
189196
while ((request_rate > 0)); do
190197
# clear prefix cache
191-
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
198+
curl -X POST http://${HOSTNAME}:8004/reset_prefix_cache
192199
sleep 5
193200
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
194201
vllm bench serve \
@@ -204,6 +211,7 @@ run_benchmark() {
204211
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
205212
--num-prompts 100 \
206213
--random-prefix-len $prefix_len \
214+
--host "$HOSTNAME" \
207215
--port 8004 &> "$bm_log"
208216
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
209217
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
@@ -304,6 +312,7 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
304312
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
305313
--num-prompts 100 \
306314
--random-prefix-len $prefix_len \
315+
--host "$HOSTNAME" \
307316
--port 8004 \
308317
--profile &> "$bm_log"
309318
else

cmake/external_projects/flashmla.cmake

Lines changed: 11 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -35,16 +35,21 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
3535
# sm90a
3636

3737
set(SUPPORT_ARCHS)
38-
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3)
39-
list(APPEND SUPPORT_ARCHS 9.0a)
38+
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3)
39+
list(APPEND SUPPORT_ARCHS "9.0a")
4040
endif()
41-
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8)
42-
list(APPEND SUPPORT_ARCHS 10.0a)
41+
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.9)
42+
# CUDA 12.9 has introduced "Family-Specific Architecture Features"
43+
# this supports all compute_10x family
44+
list(APPEND SUPPORT_ARCHS "10.0f")
45+
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
46+
list(APPEND SUPPORT_ARCHS "10.0a")
4347
endif()
4448

4549

4650
cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}")
4751
if(FLASH_MLA_ARCHS)
52+
message(STATUS "FlashMLA CUDA architectures: ${FLASH_MLA_ARCHS}")
4853
set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS})
4954
list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math")
5055

@@ -126,7 +131,8 @@ if(FLASH_MLA_ARCHS)
126131
$<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
127132
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
128133
else()
129-
# Create empty targets for setup.py when not targeting sm90a systems
134+
message(STATUS "FlashMLA will not compile: unsupported CUDA architecture ${CUDA_ARCHS}")
135+
# Create empty targets for setup.py on unsupported systems
130136
add_custom_target(_flashmla_C)
131137
add_custom_target(_flashmla_extension_C)
132138
endif()

0 commit comments

Comments (0)