Skip to content

Commit e9ab033

Browse files
authored
Merge branch 'main' into fix/validate-tool-requests-29432
2 parents d9f63e9 + e94384b commit e9ab033

File tree

143 files changed

+2562
-1252
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

143 files changed

+2562
-1252
lines changed

.buildkite/test-amd.yaml

Lines changed: 5 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -61,8 +61,8 @@ steps:
6161
- pytest -v -s -m 'not cpu_test' multimodal
6262
- pytest -v -s utils_
6363

64-
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
65-
timeout_in_minutes: 20
64+
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
65+
timeout_in_minutes: 30
6666
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
6767
agent_pool: mi325_1
6868
grade: Blocking
@@ -73,6 +73,7 @@ steps:
7373
- tests/multimodal
7474
- tests/standalone_tests/lazy_imports.py
7575
- tests/tokenizers_
76+
- tests/tool_parsers
7677
- tests/transformers_utils
7778
- tests/config
7879
no_gpu: true
@@ -82,6 +83,7 @@ steps:
8283
- pytest -v -s test_outputs.py
8384
- pytest -v -s -m 'cpu_test' multimodal
8485
- pytest -v -s tokenizers_
86+
- pytest -v -s tool_parsers
8587
- pytest -v -s transformers_utils
8688
- pytest -v -s config
8789

@@ -759,19 +761,7 @@ steps:
759761
- vllm/
760762
- tests/tool_use
761763
commands:
762-
- pytest -v -s -m 'not cpu_test' tool_use
763-
764-
- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
765-
mirror_hardwares: [amdexperimental, amdproduction]
766-
agent_pool: mi325_1
767-
# grade: Blocking
768-
timeout_in_minutes: 10
769-
source_file_dependencies:
770-
- vllm/
771-
- tests/tool_use
772-
no_gpu: true
773-
commands:
774-
- pytest -v -s -m 'cpu_test' tool_use
764+
- pytest -v -s tool_use
775765

776766
##### models test #####
777767

.buildkite/test-pipeline.yaml

Lines changed: 5 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -57,15 +57,16 @@ steps:
5757
- pytest -v -s -m 'not cpu_test' multimodal
5858
- pytest -v -s utils_
5959

60-
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
61-
timeout_in_minutes: 20
60+
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
61+
timeout_in_minutes: 30
6262
source_file_dependencies:
6363
- vllm/
6464
- tests/test_inputs.py
6565
- tests/test_outputs.py
6666
- tests/multimodal
6767
- tests/standalone_tests/lazy_imports.py
6868
- tests/tokenizers_
69+
- tests/tool_parsers
6970
- tests/transformers_utils
7071
- tests/config
7172
no_gpu: true
@@ -75,6 +76,7 @@ steps:
7576
- pytest -v -s test_outputs.py
7677
- pytest -v -s -m 'cpu_test' multimodal
7778
- pytest -v -s tokenizers_
79+
- pytest -v -s tool_parsers
7880
- pytest -v -s transformers_utils
7981
- pytest -v -s config
8082

@@ -672,16 +674,7 @@ steps:
672674
- vllm/
673675
- tests/tool_use
674676
commands:
675-
- pytest -v -s -m 'not cpu_test' tool_use
676-
677-
- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
678-
timeout_in_minutes: 10
679-
source_file_dependencies:
680-
- vllm/
681-
- tests/tool_use
682-
no_gpu: true
683-
commands:
684-
- pytest -v -s -m 'cpu_test' tool_use
677+
- pytest -v -s tool_use
685678

686679
##### models test #####
687680

.buildkite/test_areas/misc.yaml

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -115,14 +115,15 @@ steps:
115115

116116
- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
117117
depends_on: ~
118-
timeout_in_minutes: 20
118+
timeout_in_minutes: 30
119119
source_file_dependencies:
120120
- vllm/
121121
- tests/test_inputs.py
122122
- tests/test_outputs.py
123123
- tests/multimodal
124124
- tests/standalone_tests/lazy_imports.py
125125
- tests/tokenizers_
126+
- tests/tool_parsers
126127
- tests/transformers_utils
127128
- tests/config
128129
no_gpu: true
@@ -132,6 +133,7 @@ steps:
132133
- pytest -v -s test_outputs.py
133134
- pytest -v -s -m 'cpu_test' multimodal
134135
- pytest -v -s tokenizers_
136+
- pytest -v -s tool_parsers
135137
- pytest -v -s transformers_utils
136138
- pytest -v -s config
137139

.buildkite/test_areas/tool_use.yaml

Lines changed: 1 addition & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -10,14 +10,4 @@ steps:
1010
- vllm/
1111
- tests/tool_use
1212
commands:
13-
- pytest -v -s -m 'not cpu_test' tool_use
14-
15-
- label: OpenAI-Compatible Tool Use (CPU)
16-
depends_on: ~
17-
timeout_in_minutes: 10
18-
source_file_dependencies:
19-
- vllm/
20-
- tests/tool_use
21-
no_gpu: true
22-
commands:
23-
- pytest -v -s -m 'cpu_test' tool_use
13+
- pytest -v -s tool_use

benchmarks/auto_tune/auto_tune.sh

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,11 @@ MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
1818
MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
1919
NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
2020
NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
21+
HOSTNAME=$(hostname)
22+
if [[ -z "$HOSTNAME" ]]; then
23+
echo "Error: Failed to determine hostname." >&2
24+
exit 1
25+
fi
2126

2227
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
2328
RESULT="$LOG_FOLDER/result.txt"
@@ -82,6 +87,7 @@ start_server() {
8287
"$MODEL"
8388
"--disable-log-requests"
8489
"--port" "8004"
90+
"--host" "$HOSTNAME"
8591
"--gpu-memory-utilization" "$gpu_memory_utilization"
8692
"--max-num-seqs" "$max_num_seqs"
8793
"--max-num-batched-tokens" "$max_num_batched_tokens"
@@ -113,7 +119,7 @@ start_server() {
113119
# since that we should always have permission to send signal to the server process.
114120
kill -0 $server_pid 2> /dev/null || break
115121

116-
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
122+
RESPONSE=$(curl -s -X GET "http://${HOSTNAME}:8004/health" -w "%{http_code}" -o /dev/stdout)
117123
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
118124
if [[ "$STATUS_CODE" -eq 200 ]]; then
119125
server_started=1
@@ -173,6 +179,7 @@ run_benchmark() {
173179
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
174180
--num-prompts 1000 \
175181
--random-prefix-len $prefix_len \
182+
--host "$HOSTNAME" \
176183
--port 8004 &> "$bm_log"
177184
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
178185
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
@@ -188,7 +195,7 @@ run_benchmark() {
188195
request_rate=$((${throughput%.*} + 1))
189196
while ((request_rate > 0)); do
190197
# clear prefix cache
191-
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
198+
curl -X POST http://${HOSTNAME}:8004/reset_prefix_cache
192199
sleep 5
193200
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
194201
vllm bench serve \
@@ -204,6 +211,7 @@ run_benchmark() {
204211
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
205212
--num-prompts 100 \
206213
--random-prefix-len $prefix_len \
214+
--host "$HOSTNAME" \
207215
--port 8004 &> "$bm_log"
208216
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
209217
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
@@ -304,6 +312,7 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
304312
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
305313
--num-prompts 100 \
306314
--random-prefix-len $prefix_len \
315+
--host "$HOSTNAME" \
307316
--port 8004 \
308317
--profile &> "$bm_log"
309318
else

cmake/external_projects/flashmla.cmake

Lines changed: 11 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -35,16 +35,21 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
3535
# sm90a
3636

3737
set(SUPPORT_ARCHS)
38-
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3)
39-
list(APPEND SUPPORT_ARCHS 9.0a)
38+
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3)
39+
list(APPEND SUPPORT_ARCHS "9.0a")
4040
endif()
41-
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8)
42-
list(APPEND SUPPORT_ARCHS 10.0a)
41+
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.9)
42+
# CUDA 12.9 has introduced "Family-Specific Architecture Features"
43+
# this supports all compute_10x family
44+
list(APPEND SUPPORT_ARCHS "10.0f")
45+
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
46+
list(APPEND SUPPORT_ARCHS "10.0a")
4347
endif()
4448

4549

4650
cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}")
4751
if(FLASH_MLA_ARCHS)
52+
message(STATUS "FlashMLA CUDA architectures: ${FLASH_MLA_ARCHS}")
4853
set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS})
4954
list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math")
5055

@@ -126,7 +131,8 @@ if(FLASH_MLA_ARCHS)
126131
$<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
127132
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
128133
else()
129-
# Create empty targets for setup.py when not targeting sm90a systems
134+
message(STATUS "FlashMLA will not compile: unsupported CUDA architecture ${CUDA_ARCHS}")
135+
# Create empty targets for setup.py on unsupported systems
130136
add_custom_target(_flashmla_C)
131137
add_custom_target(_flashmla_extension_C)
132138
endif()

0 commit comments

Comments (0)