
Commit 55dcade

[Feat] Add VertexAI qwen API Service (#13828)
* add support for vertex AI QWEN API
* streaming QWEN API support
* test_partner_models_httpx
* test_partner_models_httpx_streaming
* add cost tracking for vertex_ai/qwen/qwen3-235b-a22b-instruct-2507-maas
* docs qwen models vertexAI

1 parent: 5299e9a
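The streaming support called out above is exposed through the regular `litellm.completion` API. A minimal sketch, assuming the model name from this commit's docs and the region used by its tests (not code from the commit itself):

```python
# Hypothetical usage sketch for the streaming support added in this commit.
# Model name comes from the docs diff below; region from the tests diff.
from litellm import completion

response = completion(
    model="vertex_ai/qwen/qwen3-coder-480b-a35b-instruct-maas",  # new vertex_ai/qwen/* route
    messages=[{"role": "user", "content": "hi"}],
    vertex_ai_project="your-vertex-project",
    vertex_ai_location="us-south1",
    stream=True,
)

# Chunks follow the OpenAI streaming format; delta.content may be None.
for chunk in response:
    content = chunk.choices[0].delta.content
    if content is not None:
        print(content, end="")
```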

File tree

5 files changed · +169 -14


docs/my-website/docs/providers/vertex_partner.md

Lines changed: 87 additions & 0 deletions
````diff
@@ -14,6 +14,7 @@ import TabItem from '@theme/TabItem';
 | Meta/Llama | `vertex_ai/meta/{MODEL}` | [Vertex AI - Meta Models](https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/llama) |
 | Mistral | `vertex_ai/mistral-*` | [Vertex AI - Mistral Models](https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/mistral) |
 | AI21 (Jamba) | `vertex_ai/jamba-*` | [Vertex AI - AI21 Models](https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/ai21) |
+| Qwen | `vertex_ai/qwen/*` | [Vertex AI - Qwen Models](https://cloud.google.com/vertex-ai/generative-ai/docs/maas/qwen) |
 | Model Garden | `vertex_ai/openai/{MODEL_ID}` or `vertex_ai/{MODEL_ID}` | [Vertex Model Garden](https://cloud.google.com/model-garden?hl=en) |

 ## Vertex AI - Anthropic (Claude)
@@ -571,6 +572,92 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
 </Tabs>


+## VertexAI Qwen API
+
+| Property | Details |
+|----------|---------|
+| Provider Route | `vertex_ai/qwen/{MODEL}` |
+| Vertex Documentation | [Vertex AI - Qwen Models](https://cloud.google.com/vertex-ai/generative-ai/docs/maas/qwen) |
+
+**LiteLLM supports all Vertex AI Qwen models.** Ensure you use the `vertex_ai/qwen/` prefix for all Vertex AI Qwen models.
+
+| Model Name | Usage |
+|------------------|------------------------------|
+| vertex_ai/qwen/qwen3-coder-480b-a35b-instruct-maas | `completion('vertex_ai/qwen/qwen3-coder-480b-a35b-instruct-maas', messages)` |
+| vertex_ai/qwen/qwen3-235b-a22b-instruct-2507-maas | `completion('vertex_ai/qwen/qwen3-235b-a22b-instruct-2507-maas', messages)` |
+
+#### Usage
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import completion
+import os
+
+os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = ""
+
+model = "qwen/qwen3-coder-480b-a35b-instruct-maas"
+
+vertex_ai_project = "your-vertex-project"  # can also set this as os.environ["VERTEXAI_PROJECT"]
+vertex_ai_location = "your-vertex-location"  # can also set this as os.environ["VERTEXAI_LOCATION"]
+
+response = completion(
+    model="vertex_ai/" + model,
+    messages=[{"role": "user", "content": "hi"}],
+    vertex_ai_project=vertex_ai_project,
+    vertex_ai_location=vertex_ai_location,
+)
+print("\nModel Response", response)
+```
+</TabItem>
+<TabItem value="proxy" label="Proxy">
+
+**1. Add to config**
+
+```yaml
+model_list:
+  - model_name: vertex-qwen
+    litellm_params:
+      model: vertex_ai/qwen/qwen3-coder-480b-a35b-instruct-maas
+      vertex_ai_project: "my-test-project"
+      vertex_ai_location: "us-east-1"
+  - model_name: vertex-qwen
+    litellm_params:
+      model: vertex_ai/qwen/qwen3-coder-480b-a35b-instruct-maas
+      vertex_ai_project: "my-test-project"
+      vertex_ai_location: "us-west-1"
+```
+
+**2. Start proxy**
+
+```bash
+litellm --config /path/to/config.yaml
+
+# RUNNING at http://0.0.0.0:4000
+```
+
+**3. Test it!** (`"model"` is the `model_name` from your config)
+
+```bash
+curl --location 'http://0.0.0.0:4000/chat/completions' \
+  --header 'Authorization: Bearer sk-1234' \
+  --header 'Content-Type: application/json' \
+  --data '{
+    "model": "vertex-qwen",
+    "messages": [
+      {
+        "role": "user",
+        "content": "what llm are you"
+      }
+    ]
+  }'
+```
+
+</TabItem>
+</Tabs>
+
+
 ## Model Garden

 :::tip
````
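The SDK example above passes the project and location as call parameters; per its inline comments, the same can be supplied via environment variables. A minimal sketch of that alternative (credentials path and project/location values are placeholders):

```python
# Equivalent setup using the environment variables named in the inline
# comments of the SDK example above, instead of per-call parameters.
import os

from litellm import completion

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/service-account.json"
os.environ["VERTEXAI_PROJECT"] = "your-vertex-project"
os.environ["VERTEXAI_LOCATION"] = "your-vertex-location"

response = completion(
    model="vertex_ai/qwen/qwen3-coder-480b-a35b-instruct-maas",
    messages=[{"role": "user", "content": "hi"}],
)
print(response.choices[0].message.content)
```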

litellm/llms/vertex_ai/vertex_ai_partner_models/main.py

Lines changed: 14 additions & 2 deletions
```diff
@@ -48,9 +48,21 @@ def is_vertex_partner_model(model: str):
             or model.startswith("codestral")
             or model.startswith("jamba")
             or model.startswith("claude")
+            or model.startswith("qwen")
         ):
             return True
         return False
+
+    @staticmethod
+    def should_use_openai_handler(model: str):
+        OPENAI_LIKE_VERTEX_PROVIDERS = [
+            "llama",
+            "deepseek-ai",
+            "qwen",
+        ]
+        if any(provider in model for provider in OPENAI_LIKE_VERTEX_PROVIDERS):
+            return True
+        return False

     def completion(
         self,
@@ -115,7 +127,7 @@ def completion(

         optional_params["stream"] = stream

-        if "llama" in model or "deepseek-ai" in model:
+        if self.should_use_openai_handler(model):
             partner = VertexPartnerProvider.llama
         elif "mistral" in model or "codestral" in model:
             partner = VertexPartnerProvider.mistralai
@@ -191,7 +203,7 @@ def completion(
                 client=client,
                 custom_llm_provider=LlmProviders.VERTEX_AI.value,
             )
-        elif "llama" in model:
+        elif self.should_use_openai_handler(model):
             return base_llm_http_handler.completion(
                 model=model,
                 stream=stream,
```
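The new `should_use_openai_handler` helper routes any model whose name contains one of the listed provider substrings to the OpenAI-compatible handler. A standalone sketch of that matching logic (the function body mirrors the diff above; the sample model strings are illustrative):

```python
# Standalone reproduction of the substring-matching logic from the diff above.
OPENAI_LIKE_VERTEX_PROVIDERS = ["llama", "deepseek-ai", "qwen"]

def should_use_openai_handler(model: str) -> bool:
    # Plain substring match, so any model id containing "qwen", "llama",
    # or "deepseek-ai" takes the OpenAI-like code path.
    return any(provider in model for provider in OPENAI_LIKE_VERTEX_PROVIDERS)

# Illustrative inputs: qwen and llama models now share the OpenAI-like path,
# while mistral models continue through the Mistral handler.
assert should_use_openai_handler("qwen/qwen3-coder-480b-a35b-instruct-maas")
assert should_use_openai_handler("meta/llama-4-scout-17b-16e-instruct-maas")
assert not should_use_openai_handler("mistral-large-2411")
```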

litellm/model_prices_and_context_window_backup.json

Lines changed: 24 additions & 0 deletions
```diff
@@ -9723,6 +9723,30 @@
         "supports_tool_choice": true,
         "supports_prompt_caching": true
     },
+    "vertex_ai/qwen/qwen3-coder-480b-a35b-instruct-maas": {
+        "max_tokens": 32768,
+        "max_input_tokens": 262144,
+        "max_output_tokens": 32768,
+        "input_cost_per_token": 1e-06,
+        "output_cost_per_token": 4e-06,
+        "litellm_provider": "vertex_ai-qwen_models",
+        "mode": "chat",
+        "source": "https://cloud.google.com/vertex-ai/generative-ai/pricing",
+        "supports_function_calling": true,
+        "supports_tool_choice": true
+    },
+    "vertex_ai/qwen/qwen3-235b-a22b-instruct-2507-maas": {
+        "max_tokens": 16384,
+        "max_input_tokens": 262144,
+        "max_output_tokens": 16384,
+        "input_cost_per_token": 0.25e-06,
+        "output_cost_per_token": 1e-06,
+        "litellm_provider": "vertex_ai-qwen_models",
+        "mode": "chat",
+        "source": "https://cloud.google.com/vertex-ai/generative-ai/pricing",
+        "supports_function_calling": true,
+        "supports_tool_choice": true
+    },
     "vertex_ai/meta/llama3-405b-instruct-maas": {
         "max_tokens": 32000,
         "max_input_tokens": 32000,
```

model_prices_and_context_window.json

Lines changed: 24 additions & 0 deletions
```diff
@@ -9723,6 +9723,30 @@
         "supports_tool_choice": true,
         "supports_prompt_caching": true
     },
+    "vertex_ai/qwen/qwen3-coder-480b-a35b-instruct-maas": {
+        "max_tokens": 32768,
+        "max_input_tokens": 262144,
+        "max_output_tokens": 32768,
+        "input_cost_per_token": 1e-06,
+        "output_cost_per_token": 4e-06,
+        "litellm_provider": "vertex_ai-qwen_models",
+        "mode": "chat",
+        "source": "https://cloud.google.com/vertex-ai/generative-ai/pricing",
+        "supports_function_calling": true,
+        "supports_tool_choice": true
+    },
+    "vertex_ai/qwen/qwen3-235b-a22b-instruct-2507-maas": {
+        "max_tokens": 16384,
+        "max_input_tokens": 262144,
+        "max_output_tokens": 16384,
+        "input_cost_per_token": 0.25e-06,
+        "output_cost_per_token": 1e-06,
+        "litellm_provider": "vertex_ai-qwen_models",
+        "mode": "chat",
+        "source": "https://cloud.google.com/vertex-ai/generative-ai/pricing",
+        "supports_function_calling": true,
+        "supports_tool_choice": true
+    },
     "vertex_ai/meta/llama3-405b-instruct-maas": {
         "max_tokens": 32000,
         "max_input_tokens": 32000,
```

tests/local_testing/test_amazing_vertex_completion.py

Lines changed: 20 additions & 12 deletions
```diff
@@ -835,20 +835,20 @@ async def test_gemini_pro_function_calling_httpx(model, sync_mode):


 @pytest.mark.parametrize(
-    "model",
+    "model,region",
     [
-        "vertex_ai/mistral-large-2411",
-        "vertex_ai/mistral-nemo@2407",
-        # "vertex_ai/meta/llama3-405b-instruct-maas",
-    ], #
-) # "vertex_ai",
+        ("vertex_ai/mistral-large-2411", "us-central1"),
+        ("vertex_ai/mistral-nemo@2407", "us-central1"),
+        ("vertex_ai/qwen/qwen3-coder-480b-a35b-instruct-maas", "us-south1")
+    ],
+)
 @pytest.mark.parametrize(
     "sync_mode",
     [True, False],
 ) #
 @pytest.mark.flaky(retries=3, delay=1)
 @pytest.mark.asyncio
-async def test_partner_models_httpx(model, sync_mode):
+async def test_partner_models_httpx(model, region, sync_mode):
     try:
         load_vertex_ai_credentials()
         litellm.set_verbose = True
@@ -869,6 +869,7 @@ async def test_partner_models_httpx(model, sync_mode):
             "model": model,
             "messages": messages,
             "timeout": 10,
+            "vertex_ai_location": region,
         }
         if sync_mode:
             response = litellm.completion(**data)
@@ -881,26 +882,33 @@ async def test_partner_models_httpx(model, sync_mode):

         assert isinstance(response._hidden_params["response_cost"], float)
     except litellm.RateLimitError as e:
+        print("RateLimitError", e)
         pass
     except litellm.Timeout as e:
+        print("Timeout", e)
         pass
     except litellm.InternalServerError as e:
+        print("InternalServerError", e)
         pass
     except litellm.APIConnectionError as e:
+        print("APIConnectionError", e)
         pass
     except litellm.ServiceUnavailableError as e:
+        print("ServiceUnavailableError", e)
         pass
     except Exception as e:
+        print("got generic exception", e)
         if "429 Quota exceeded" in str(e):
             pass
         else:
             pytest.fail("An unexpected exception occurred - {}".format(str(e)))


 @pytest.mark.parametrize(
-    "model",
+    "model,region",
     [
-        "vertex_ai/meta/llama-4-scout-17b-16e-instruct-maas",
+        ("vertex_ai/meta/llama-4-scout-17b-16e-instruct-maas", "us-east5"),
+        ("vertex_ai/qwen/qwen3-coder-480b-a35b-instruct-maas", "us-south1"),
     ],
 )
 @pytest.mark.parametrize(
@@ -909,9 +917,9 @@ async def test_partner_models_httpx(model, sync_mode):
 ) #
 @pytest.mark.asyncio
 @pytest.mark.flaky(retries=3, delay=1)
-async def test_partner_models_httpx_streaming(model, sync_mode):
+async def test_partner_models_httpx_streaming(model, region, sync_mode):
     try:
-        load_vertex_ai_credentials()
+        #load_vertex_ai_credentials()
         litellm._turn_on_debug()

         messages = [
@@ -930,7 +938,7 @@ async def test_partner_models_httpx_streaming(model, sync_mode):
             "model": model,
             "messages": messages,
             "stream": True,
-            "vertex_ai_location": "us-east5",
+            "vertex_ai_location": region,
         }
         if sync_mode:
             response = litellm.completion(**data)
```
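To exercise just the two updated tests, pytest can be invoked programmatically. The path and flags below describe a typical local run and are an assumption, not part of the commit; the tests also require valid Vertex AI credentials, which is why they are marked flaky and tolerant of quota and availability errors:

```python
# Hypothetical local runner for the two updated tests.
import pytest

pytest.main([
    "tests/local_testing/test_amazing_vertex_completion.py",
    "-k", "test_partner_models_httpx",  # also matches the _streaming variant
    "-x",  # stop on first failure
    "-s",  # show the print() diagnostics added in this commit
])
```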
