
Commit 55dcade

[Feat] Add VertexAI qwen API Service (#13828)
* add support for vertex AI QWEN API
* streaming QWEN API support
* test_partner_models_httpx
* test_partner_models_httpx_streaming
* add cost tracking for vertex_ai/qwen/qwen3-235b-a22b-instruct-2507-maas
* docs qwen models vertexAI

1 parent: 5299e9a
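The streaming support called out above is exposed through the regular `litellm.completion` API. A minimal sketch, assuming the model name from this commit's docs and the region used by its tests (not code from the commit itself):

```python
# Hypothetical usage sketch for the streaming support added in this commit.
# Model name comes from the docs diff below; region from the tests diff.
from litellm import completion

response = completion(
    model="vertex_ai/qwen/qwen3-coder-480b-a35b-instruct-maas",  # new vertex_ai/qwen/* route
    messages=[{"role": "user", "content": "hi"}],
    vertex_ai_project="your-vertex-project",
    vertex_ai_location="us-south1",
    stream=True,
)

# Chunks follow the OpenAI streaming format; delta.content may be None.
for chunk in response:
    content = chunk.choices[0].delta.content
    if content is not None:
        print(content, end="")
```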

File tree

5 files changed · +169 -14


docs/my-website/docs/providers/vertex_partner.md

Lines changed: 87 additions & 0 deletions
````diff
@@ -14,6 +14,7 @@ import TabItem from '@theme/TabItem';
 | Meta/Llama | `vertex_ai/meta/{MODEL}` | [Vertex AI - Meta Models](https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/llama) |
 | Mistral | `vertex_ai/mistral-*` | [Vertex AI - Mistral Models](https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/mistral) |
 | AI21 (Jamba) | `vertex_ai/jamba-*` | [Vertex AI - AI21 Models](https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/ai21) |
+| Qwen | `vertex_ai/qwen/*` | [Vertex AI - Qwen Models](https://cloud.google.com/vertex-ai/generative-ai/docs/maas/qwen) |
 | Model Garden | `vertex_ai/openai/{MODEL_ID}` or `vertex_ai/{MODEL_ID}` | [Vertex Model Garden](https://cloud.google.com/model-garden?hl=en) |

 ## Vertex AI - Anthropic (Claude)
@@ -571,6 +572,92 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
 </Tabs>


+## VertexAI Qwen API
+
+| Property | Details |
+|----------|---------|
+| Provider Route | `vertex_ai/qwen/{MODEL}` |
+| Vertex Documentation | [Vertex AI - Qwen Models](https://cloud.google.com/vertex-ai/generative-ai/docs/maas/qwen) |
+
+**LiteLLM supports all Vertex AI Qwen models.** Ensure you use the `vertex_ai/qwen/` prefix for all Vertex AI Qwen models.
+
+| Model Name | Usage |
+|------------------|------------------------------|
+| vertex_ai/qwen/qwen3-coder-480b-a35b-instruct-maas | `completion('vertex_ai/qwen/qwen3-coder-480b-a35b-instruct-maas', messages)` |
+| vertex_ai/qwen/qwen3-235b-a22b-instruct-2507-maas | `completion('vertex_ai/qwen/qwen3-235b-a22b-instruct-2507-maas', messages)` |
+
+#### Usage
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import completion
+import os
+
+os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = ""
+
+model = "qwen/qwen3-coder-480b-a35b-instruct-maas"
+
+vertex_ai_project = "your-vertex-project"  # can also set this as os.environ["VERTEXAI_PROJECT"]
+vertex_ai_location = "your-vertex-location"  # can also set this as os.environ["VERTEXAI_LOCATION"]
+
+response = completion(
+    model="vertex_ai/" + model,
+    messages=[{"role": "user", "content": "hi"}],
+    vertex_ai_project=vertex_ai_project,
+    vertex_ai_location=vertex_ai_location,
+)
+print("\nModel Response", response)
+```
+</TabItem>
+<TabItem value="proxy" label="Proxy">
+
+**1. Add to config**
+
+```yaml
+model_list:
+  - model_name: vertex-qwen
+    litellm_params:
+      model: vertex_ai/qwen/qwen3-coder-480b-a35b-instruct-maas
+      vertex_ai_project: "my-test-project"
+      vertex_ai_location: "us-east-1"
+  - model_name: vertex-qwen
+    litellm_params:
+      model: vertex_ai/qwen/qwen3-coder-480b-a35b-instruct-maas
+      vertex_ai_project: "my-test-project"
+      vertex_ai_location: "us-west-1"
+```
+
+**2. Start proxy**
+
+```bash
+litellm --config /path/to/config.yaml
+
+# RUNNING at http://0.0.0.0:4000
+```
+
+**3. Test it!** (`"model"` is the `model_name` from your config)
+
+```bash
+curl --location 'http://0.0.0.0:4000/chat/completions' \
+  --header 'Authorization: Bearer sk-1234' \
+  --header 'Content-Type: application/json' \
+  --data '{
+    "model": "vertex-qwen",
+    "messages": [
+      {
+        "role": "user",
+        "content": "what llm are you"
+      }
+    ]
+  }'
+```
+
+</TabItem>
+</Tabs>
+
+
 ## Model Garden

 :::tip
````
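The SDK example above passes the project and location as call parameters; per its inline comments, the same can be supplied via environment variables. A minimal sketch of that alternative (credentials path and project/location values are placeholders):

```python
# Equivalent setup using the environment variables named in the inline
# comments of the SDK example above, instead of per-call parameters.
import os

from litellm import completion

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/service-account.json"
os.environ["VERTEXAI_PROJECT"] = "your-vertex-project"
os.environ["VERTEXAI_LOCATION"] = "your-vertex-location"

response = completion(
    model="vertex_ai/qwen/qwen3-coder-480b-a35b-instruct-maas",
    messages=[{"role": "user", "content": "hi"}],
)
print(response.choices[0].message.content)
```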

litellm/llms/vertex_ai/vertex_ai_partner_models/main.py

Lines changed: 14 additions & 2 deletions
```diff
@@ -48,9 +48,21 @@ def is_vertex_partner_model(model: str):
             or model.startswith("codestral")
             or model.startswith("jamba")
             or model.startswith("claude")
+            or model.startswith("qwen")
         ):
             return True
         return False
+
+    @staticmethod
+    def should_use_openai_handler(model: str):
+        OPENAI_LIKE_VERTEX_PROVIDERS = [
+            "llama",
+            "deepseek-ai",
+            "qwen",
+        ]
+        if any(provider in model for provider in OPENAI_LIKE_VERTEX_PROVIDERS):
+            return True
+        return False

     def completion(
         self,
@@ -115,7 +127,7 @@ def completion(

         optional_params["stream"] = stream

-        if "llama" in model or "deepseek-ai" in model:
+        if self.should_use_openai_handler(model):
             partner = VertexPartnerProvider.llama
         elif "mistral" in model or "codestral" in model:
             partner = VertexPartnerProvider.mistralai
@@ -191,7 +203,7 @@ def completion(
                 client=client,
                 custom_llm_provider=LlmProviders.VERTEX_AI.value,
             )
-        elif "llama" in model:
+        elif self.should_use_openai_handler(model):
             return base_llm_http_handler.completion(
                 model=model,
                 stream=stream,
```
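The new `should_use_openai_handler` helper routes any model whose name contains one of the listed provider substrings to the OpenAI-compatible handler. A standalone sketch of that matching logic (the function body mirrors the diff above; the sample model strings are illustrative):

```python
# Standalone reproduction of the substring-matching logic from the diff above.
OPENAI_LIKE_VERTEX_PROVIDERS = ["llama", "deepseek-ai", "qwen"]

def should_use_openai_handler(model: str) -> bool:
    # Plain substring match, so any model id containing "qwen", "llama",
    # or "deepseek-ai" takes the OpenAI-like code path.
    return any(provider in model for provider in OPENAI_LIKE_VERTEX_PROVIDERS)

# Illustrative inputs: qwen and llama models now share the OpenAI-like path,
# while mistral models continue through the Mistral handler.
assert should_use_openai_handler("qwen/qwen3-coder-480b-a35b-instruct-maas")
assert should_use_openai_handler("meta/llama-4-scout-17b-16e-instruct-maas")
assert not should_use_openai_handler("mistral-large-2411")
```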

litellm/model_prices_and_context_window_backup.json

Lines changed: 24 additions & 0 deletions
```diff
@@ -9723,6 +9723,30 @@
         "supports_tool_choice": true,
         "supports_prompt_caching": true
     },
+    "vertex_ai/qwen/qwen3-coder-480b-a35b-instruct-maas": {
+        "max_tokens": 32768,
+        "max_input_tokens": 262144,
+        "max_output_tokens": 32768,
+        "input_cost_per_token": 1e-06,
+        "output_cost_per_token": 4e-06,
+        "litellm_provider": "vertex_ai-qwen_models",
+        "mode": "chat",
+        "source": "https://cloud.google.com/vertex-ai/generative-ai/pricing",
+        "supports_function_calling": true,
+        "supports_tool_choice": true
+    },
+    "vertex_ai/qwen/qwen3-235b-a22b-instruct-2507-maas": {
+        "max_tokens": 16384,
+        "max_input_tokens": 262144,
+        "max_output_tokens": 16384,
+        "input_cost_per_token": 0.25e-06,
+        "output_cost_per_token": 1e-06,
+        "litellm_provider": "vertex_ai-qwen_models",
+        "mode": "chat",
+        "source": "https://cloud.google.com/vertex-ai/generative-ai/pricing",
+        "supports_function_calling": true,
+        "supports_tool_choice": true
+    },
     "vertex_ai/meta/llama3-405b-instruct-maas": {
         "max_tokens": 32000,
         "max_input_tokens": 32000,
```

model_prices_and_context_window.json

Lines changed: 24 additions & 0 deletions
```diff
@@ -9723,6 +9723,30 @@
         "supports_tool_choice": true,
         "supports_prompt_caching": true
     },
+    "vertex_ai/qwen/qwen3-coder-480b-a35b-instruct-maas": {
+        "max_tokens": 32768,
+        "max_input_tokens": 262144,
+        "max_output_tokens": 32768,
+        "input_cost_per_token": 1e-06,
+        "output_cost_per_token": 4e-06,
+        "litellm_provider": "vertex_ai-qwen_models",
+        "mode": "chat",
+        "source": "https://cloud.google.com/vertex-ai/generative-ai/pricing",
+        "supports_function_calling": true,
+        "supports_tool_choice": true
+    },
+    "vertex_ai/qwen/qwen3-235b-a22b-instruct-2507-maas": {
+        "max_tokens": 16384,
+        "max_input_tokens": 262144,
+        "max_output_tokens": 16384,
+        "input_cost_per_token": 0.25e-06,
+        "output_cost_per_token": 1e-06,
+        "litellm_provider": "vertex_ai-qwen_models",
+        "mode": "chat",
+        "source": "https://cloud.google.com/vertex-ai/generative-ai/pricing",
+        "supports_function_calling": true,
+        "supports_tool_choice": true
+    },
     "vertex_ai/meta/llama3-405b-instruct-maas": {
         "max_tokens": 32000,
         "max_input_tokens": 32000,
```

tests/local_testing/test_amazing_vertex_completion.py

Lines changed: 20 additions & 12 deletions
```diff
@@ -835,20 +835,20 @@ async def test_gemini_pro_function_calling_httpx(model, sync_mode):


 @pytest.mark.parametrize(
-    "model",
+    "model,region",
     [
-        "vertex_ai/mistral-large-2411",
-        "vertex_ai/mistral-nemo@2407",
-        # "vertex_ai/meta/llama3-405b-instruct-maas",
-    ], #
-) # "vertex_ai",
+        ("vertex_ai/mistral-large-2411", "us-central1"),
+        ("vertex_ai/mistral-nemo@2407", "us-central1"),
+        ("vertex_ai/qwen/qwen3-coder-480b-a35b-instruct-maas", "us-south1")
+    ],
+)
 @pytest.mark.parametrize(
     "sync_mode",
     [True, False],
 ) #
 @pytest.mark.flaky(retries=3, delay=1)
 @pytest.mark.asyncio
-async def test_partner_models_httpx(model, sync_mode):
+async def test_partner_models_httpx(model, region, sync_mode):
     try:
         load_vertex_ai_credentials()
         litellm.set_verbose = True
@@ -869,6 +869,7 @@ async def test_partner_models_httpx(model, sync_mode):
             "model": model,
             "messages": messages,
             "timeout": 10,
+            "vertex_ai_location": region,
         }
         if sync_mode:
             response = litellm.completion(**data)
@@ -881,26 +882,33 @@ async def test_partner_models_httpx(model, sync_mode):

         assert isinstance(response._hidden_params["response_cost"], float)
     except litellm.RateLimitError as e:
+        print("RateLimitError", e)
         pass
     except litellm.Timeout as e:
+        print("Timeout", e)
         pass
     except litellm.InternalServerError as e:
+        print("InternalServerError", e)
         pass
     except litellm.APIConnectionError as e:
+        print("APIConnectionError", e)
         pass
     except litellm.ServiceUnavailableError as e:
+        print("ServiceUnavailableError", e)
         pass
     except Exception as e:
+        print("got generic exception", e)
         if "429 Quota exceeded" in str(e):
             pass
         else:
             pytest.fail("An unexpected exception occurred - {}".format(str(e)))


 @pytest.mark.parametrize(
-    "model",
+    "model,region",
     [
-        "vertex_ai/meta/llama-4-scout-17b-16e-instruct-maas",
+        ("vertex_ai/meta/llama-4-scout-17b-16e-instruct-maas", "us-east5"),
+        ("vertex_ai/qwen/qwen3-coder-480b-a35b-instruct-maas", "us-south1"),
     ],
 )
 @pytest.mark.parametrize(
@@ -909,9 +917,9 @@ async def test_partner_models_httpx(model, sync_mode):
 ) #
 @pytest.mark.asyncio
 @pytest.mark.flaky(retries=3, delay=1)
-async def test_partner_models_httpx_streaming(model, sync_mode):
+async def test_partner_models_httpx_streaming(model, region, sync_mode):
     try:
-        load_vertex_ai_credentials()
+        #load_vertex_ai_credentials()
         litellm._turn_on_debug()

         messages = [
@@ -930,7 +938,7 @@ async def test_partner_models_httpx_streaming(model, sync_mode):
             "model": model,
             "messages": messages,
             "stream": True,
-            "vertex_ai_location": "us-east5",
+            "vertex_ai_location": region,
         }
         if sync_mode:
             response = litellm.completion(**data)
```
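To exercise just the two updated tests, pytest can be invoked programmatically. The path and flags below describe a typical local run and are an assumption, not part of the commit; the tests also require valid Vertex AI credentials, which is why they are marked flaky and tolerant of quota and availability errors:

```python
# Hypothetical local runner for the two updated tests.
import pytest

pytest.main([
    "tests/local_testing/test_amazing_vertex_completion.py",
    "-k", "test_partner_models_httpx",  # also matches the _streaming variant
    "-x",  # stop on first failure
    "-s",  # show the print() diagnostics added in this commit
])
```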
