@@ -142,24 +142,47 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
142142 "Non-MLA LLMs forcibly disable the chunked prefill feature,"
143143 "as the performance of operators supporting this feature "
144144 "functionality is currently suboptimal." )
-        if not model_config.is_multimodal_model and \
-            structured_outputs_config.backend == "auto" and \
-            not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
-            scheduler_config.policy == "fcfs":
-            ascend_scheduler_config.enabled = True
-            chunked_prefill_enabled_in_ascend_scheduler = getattr(
-                ascend_scheduler_config, "enable_chunked_prefill", False)
-            if chunked_prefill_enabled_in_ascend_scheduler:
-                logger.warning(
-                    "Chunked prefill feature is enabled in ascend_scheduler,"
-                    "but note that the operator supporting this feature "
-                    "would lead to performance degradation.")
-            # In this situation, max_num_batched_tokens would have been rewritten.
-            # So we must make sure max_num_batched_tokens is not smaller than max_model_len.
-            if (scheduler_config.max_num_batched_tokens
-                    < scheduler_config.max_model_len
-                    and not chunked_prefill_enabled_in_ascend_scheduler):
-                scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len
+        if vllm_version_is("0.11.0"):
+            if not model_config.is_multimodal_model and \
+                structured_outputs_config.backend == "auto" and \
+                not scheduler_config.send_delta_data and \
+                not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
+                scheduler_config.policy == "fcfs":
+                ascend_scheduler_config.enabled = True
+                chunked_prefill_enabled_in_ascend_scheduler = getattr(
+                    ascend_scheduler_config, "enable_chunked_prefill",
+                    False)
+                if chunked_prefill_enabled_in_ascend_scheduler:
+                    logger.warning(
+                        "Chunked prefill feature is enabled in ascend_scheduler, "
+                        "but note that the operator supporting this feature "
+                        "would lead to performance degradation.")
+                # In this situation, max_num_batched_tokens would have been rewritten.
+                # So we must make sure max_num_batched_tokens is not smaller than max_model_len.
+                if (scheduler_config.max_num_batched_tokens
+                        < scheduler_config.max_model_len and
+                        not chunked_prefill_enabled_in_ascend_scheduler):
+                    scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len
+        else:
+            if not model_config.is_multimodal_model and \
+                structured_outputs_config.backend == "auto" and \
+                not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
+                scheduler_config.policy == "fcfs":
+                ascend_scheduler_config.enabled = True
+                chunked_prefill_enabled_in_ascend_scheduler = getattr(
+                    ascend_scheduler_config, "enable_chunked_prefill",
+                    False)
+                if chunked_prefill_enabled_in_ascend_scheduler:
+                    logger.warning(
+                        "Chunked prefill feature is enabled in ascend_scheduler, "
+                        "but note that the operator supporting this feature "
+                        "would lead to performance degradation.")
+                # In this situation, max_num_batched_tokens would have been rewritten.
+                # So we must make sure max_num_batched_tokens is not smaller than max_model_len.
+                if (scheduler_config.max_num_batched_tokens
+                        < scheduler_config.max_model_len and
+                        not chunked_prefill_enabled_in_ascend_scheduler):
+                    scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len

         kv_cache_dtype = vllm_config.additional_config.get(
             "kv_cache_dtype", None)