87 changes: 87 additions & 0 deletions docs/my-website/docs/providers/vertex_partner.md
@@ -14,6 +14,7 @@ import TabItem from '@theme/TabItem';
| Meta/Llama | `vertex_ai/meta/{MODEL}` | [Vertex AI - Meta Models](https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/llama) |
| Mistral | `vertex_ai/mistral-*` | [Vertex AI - Mistral Models](https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/mistral) |
| AI21 (Jamba) | `vertex_ai/jamba-*` | [Vertex AI - AI21 Models](https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/ai21) |
| Qwen | `vertex_ai/qwen/*` | [Vertex AI - Qwen Models](https://cloud.google.com/vertex-ai/generative-ai/docs/maas/qwen) |
| Model Garden | `vertex_ai/openai/{MODEL_ID}` or `vertex_ai/{MODEL_ID}` | [Vertex Model Garden](https://cloud.google.com/model-garden?hl=en) |

## Vertex AI - Anthropic (Claude)
@@ -571,6 +572,92 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
</Tabs>


## VertexAI Qwen API

| Property | Details |
|----------|---------|
| Provider Route | `vertex_ai/qwen/{MODEL}` |
| Vertex Documentation | [Vertex AI - Qwen Models](https://cloud.google.com/vertex-ai/generative-ai/docs/maas/qwen) |

**LiteLLM supports all Vertex AI Qwen models.** Use the `vertex_ai/qwen/` prefix for every Vertex AI Qwen model.

| Model Name | Usage |
|------------------|------------------------------|
| vertex_ai/qwen/qwen3-coder-480b-a35b-instruct-maas | `completion('vertex_ai/qwen/qwen3-coder-480b-a35b-instruct-maas', messages)` |
| vertex_ai/qwen/qwen3-235b-a22b-instruct-2507-maas | `completion('vertex_ai/qwen/qwen3-235b-a22b-instruct-2507-maas', messages)` |

#### Usage

<Tabs>
<TabItem value="sdk" label="SDK">

```python
from litellm import completion
import os

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = ""  # path to your service account JSON

model = "qwen/qwen3-coder-480b-a35b-instruct-maas"

vertex_ai_project = "your-vertex-project" # can also set this as os.environ["VERTEXAI_PROJECT"]
vertex_ai_location = "your-vertex-location" # can also set this as os.environ["VERTEXAI_LOCATION"]

response = completion(
    model="vertex_ai/" + model,
    messages=[{"role": "user", "content": "hi"}],
    vertex_ai_project=vertex_ai_project,
    vertex_ai_location=vertex_ai_location,
)
print("\nModel Response", response)
```
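
Streaming works the same way: pass `stream=True` and iterate over the chunks. A minimal sketch, using the same placeholder credentials, project, and location values as above:

```python
from litellm import completion
import os

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = ""  # path to your service account JSON

response = completion(
    model="vertex_ai/qwen/qwen3-coder-480b-a35b-instruct-maas",
    messages=[{"role": "user", "content": "Write a haiku about code review."}],
    vertex_ai_project="your-vertex-project",  # placeholder
    vertex_ai_location="us-south1",           # placeholder region
    stream=True,
)
for chunk in response:
    print(chunk.choices[0].delta.content or "", end="")
```
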
</TabItem>
<TabItem value="proxy" label="Proxy">

**1. Add to config**

```yaml
model_list:
  - model_name: vertex-qwen
    litellm_params:
      model: vertex_ai/qwen/qwen3-coder-480b-a35b-instruct-maas
      vertex_ai_project: "my-test-project"
      vertex_ai_location: "us-east1"
  - model_name: vertex-qwen
    litellm_params:
      model: vertex_ai/qwen/qwen3-coder-480b-a35b-instruct-maas
      vertex_ai_project: "my-test-project"
      vertex_ai_location: "us-west1"
```

**2. Start proxy**

```bash
litellm --config /path/to/config.yaml

# RUNNING at http://0.0.0.0:4000
```

**3. Test it!**

```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
    "model": "vertex-qwen", # 👈 the 'model_name' in config
    "messages": [
        {
            "role": "user",
            "content": "what llm are you"
        }
    ]
}'
```
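
You can also call the proxy with the OpenAI Python SDK. A minimal sketch, assuming the same proxy URL and virtual key as the curl example above:

```python
from openai import OpenAI

client = OpenAI(
    api_key="sk-1234",               # your LiteLLM proxy key
    base_url="http://0.0.0.0:4000",  # your LiteLLM proxy URL
)

response = client.chat.completions.create(
    model="vertex-qwen",  # the `model_name` from the config
    messages=[{"role": "user", "content": "what llm are you"}],
)
print(response.choices[0].message.content)
```
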

</TabItem>
</Tabs>


## Model Garden

:::tip
16 changes: 14 additions & 2 deletions litellm/llms/vertex_ai/vertex_ai_partner_models/main.py
@@ -48,9 +48,21 @@ def is_vertex_partner_model(model: str):
or model.startswith("codestral")
or model.startswith("jamba")
or model.startswith("claude")
or model.startswith("qwen")
):
return True
return False

@staticmethod
def should_use_openai_handler(model: str):
OPENAI_LIKE_VERTEX_PROVIDERS = [
"llama",
"deepseek-ai",
"qwen",
]
if any(provider in model for provider in OPENAI_LIKE_VERTEX_PROVIDERS):
return True
return False

    def completion(
        self,
@@ -115,7 +127,7 @@ def completion(

optional_params["stream"] = stream

if "llama" in model or "deepseek-ai" in model:
if self.should_use_openai_handler(model):
partner = VertexPartnerProvider.llama
elif "mistral" in model or "codestral" in model:
partner = VertexPartnerProvider.mistralai
@@ -191,7 +203,7 @@ def completion(
client=client,
custom_llm_provider=LlmProviders.VERTEX_AI.value,
)
elif "llama" in model:
elif self.should_use_openai_handler(model):
return base_llm_http_handler.completion(
model=model,
stream=stream,
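The change above replaces the hard-coded `"llama" in model or "deepseek-ai" in model` checks with a single helper, so Qwen models are routed through the same OpenAI-compatible Vertex handler as Llama and DeepSeek. A minimal sketch of the resulting behaviour (class context simplified for illustration):

```python
class VertexAIPartnerModels:
    @staticmethod
    def should_use_openai_handler(model: str) -> bool:
        # Providers served through Vertex AI's OpenAI-compatible MaaS endpoint.
        OPENAI_LIKE_VERTEX_PROVIDERS = ["llama", "deepseek-ai", "qwen"]
        return any(provider in model for provider in OPENAI_LIKE_VERTEX_PROVIDERS)

print(VertexAIPartnerModels.should_use_openai_handler("qwen/qwen3-coder-480b-a35b-instruct-maas"))  # True
print(VertexAIPartnerModels.should_use_openai_handler("mistral-large-2411"))                        # False
```
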
24 changes: 24 additions & 0 deletions litellm/model_prices_and_context_window_backup.json
@@ -9723,6 +9723,30 @@
"supports_tool_choice": true,
"supports_prompt_caching": true
},
"vertex_ai/qwen/qwen3-coder-480b-a35b-instruct-maas": {
"max_tokens": 32768,
"max_input_tokens": 262144,
"max_output_tokens": 32768,
"input_cost_per_token": 1e-06,
"output_cost_per_token": 4e-06,
"litellm_provider": "vertex_ai-qwen_models",
"mode": "chat",
"source": "https://cloud.google.com/vertex-ai/generative-ai/pricing",
"supports_function_calling": true,
"supports_tool_choice": true
},
"vertex_ai/qwen/qwen3-235b-a22b-instruct-2507-maas": {
"max_tokens": 16384,
"max_input_tokens": 262144,
"max_output_tokens": 16384,
"input_cost_per_token": 0.25e-06,
"output_cost_per_token": 1e-06,
"litellm_provider": "vertex_ai-qwen_models",
"mode": "chat",
"source": "https://cloud.google.com/vertex-ai/generative-ai/pricing",
"supports_function_calling": true,
"supports_tool_choice": true
},
"vertex_ai/meta/llama3-405b-instruct-maas": {
"max_tokens": 32000,
"max_input_tokens": 32000,
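As a quick sanity check on the prices above: a qwen3-coder call with 10,000 input tokens and 1,000 output tokens costs 10,000 × $0.000001 + 1,000 × $0.000004 = $0.014. The same arithmetic in Python, with the per-token rates copied from the entry above:

```python
input_tokens, output_tokens = 10_000, 1_000
input_cost_per_token, output_cost_per_token = 1e-06, 4e-06  # qwen3-coder-480b rates above

cost = input_tokens * input_cost_per_token + output_tokens * output_cost_per_token
print(f"${cost:.4f}")  # $0.0140
```
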
24 changes: 24 additions & 0 deletions model_prices_and_context_window.json
@@ -9723,6 +9723,30 @@
"supports_tool_choice": true,
"supports_prompt_caching": true
},
"vertex_ai/qwen/qwen3-coder-480b-a35b-instruct-maas": {
"max_tokens": 32768,
"max_input_tokens": 262144,
"max_output_tokens": 32768,
"input_cost_per_token": 1e-06,
"output_cost_per_token": 4e-06,
"litellm_provider": "vertex_ai-qwen_models",
"mode": "chat",
"source": "https://cloud.google.com/vertex-ai/generative-ai/pricing",
"supports_function_calling": true,
"supports_tool_choice": true
},
"vertex_ai/qwen/qwen3-235b-a22b-instruct-2507-maas": {
"max_tokens": 16384,
"max_input_tokens": 262144,
"max_output_tokens": 16384,
"input_cost_per_token": 0.25e-06,
"output_cost_per_token": 1e-06,
"litellm_provider": "vertex_ai-qwen_models",
"mode": "chat",
"source": "https://cloud.google.com/vertex-ai/generative-ai/pricing",
"supports_function_calling": true,
"supports_tool_choice": true
},
"vertex_ai/meta/llama3-405b-instruct-maas": {
"max_tokens": 32000,
"max_input_tokens": 32000,
32 changes: 20 additions & 12 deletions tests/local_testing/test_amazing_vertex_completion.py
@@ -835,20 +835,20 @@ async def test_gemini_pro_function_calling_httpx(model, sync_mode):


@pytest.mark.parametrize(
"model",
"model,region",
[
"vertex_ai/mistral-large-2411",
"vertex_ai/mistral-nemo@2407",
# "vertex_ai/meta/llama3-405b-instruct-maas",
], #
) # "vertex_ai",
("vertex_ai/mistral-large-2411", "us-central1"),
("vertex_ai/mistral-nemo@2407", "us-central1"),
("vertex_ai/qwen/qwen3-coder-480b-a35b-instruct-maas", "us-south1")
],
)
@pytest.mark.parametrize(
"sync_mode",
[True, False],
) #
@pytest.mark.flaky(retries=3, delay=1)
@pytest.mark.asyncio
async def test_partner_models_httpx(model, sync_mode):
async def test_partner_models_httpx(model, region, sync_mode):
try:
load_vertex_ai_credentials()
litellm.set_verbose = True
@@ -869,6 +869,7 @@ async def test_partner_models_httpx(model, sync_mode):
"model": model,
"messages": messages,
"timeout": 10,
"vertex_ai_location": region,
}
if sync_mode:
response = litellm.completion(**data)
@@ -881,26 +882,33 @@

assert isinstance(response._hidden_params["response_cost"], float)
except litellm.RateLimitError as e:
print("RateLimitError", e)
pass
except litellm.Timeout as e:
print("Timeout", e)
pass
except litellm.InternalServerError as e:
print("InternalServerError", e)
pass
except litellm.APIConnectionError as e:
print("APIConnectionError", e)
pass
except litellm.ServiceUnavailableError as e:
print("ServiceUnavailableError", e)
pass
except Exception as e:
print("got generic exception", e)
if "429 Quota exceeded" in str(e):
pass
else:
pytest.fail("An unexpected exception occurred - {}".format(str(e)))


@pytest.mark.parametrize(
"model",
"model,region",
[
"vertex_ai/meta/llama-4-scout-17b-16e-instruct-maas",
("vertex_ai/meta/llama-4-scout-17b-16e-instruct-maas", "us-east5"),
("vertex_ai/qwen/qwen3-coder-480b-a35b-instruct-maas", "us-south1"),
],
)
@pytest.mark.parametrize(
@@ -909,9 +917,9 @@ async def test_partner_models_httpx(model, sync_mode):
) #
@pytest.mark.asyncio
@pytest.mark.flaky(retries=3, delay=1)
async def test_partner_models_httpx_streaming(model, sync_mode):
async def test_partner_models_httpx_streaming(model, region, sync_mode):
try:
load_vertex_ai_credentials()
load_vertex_ai_credentials()
litellm._turn_on_debug()

messages = [
@@ -930,7 +938,7 @@ async def test_partner_models_httpx_streaming(model, sync_mode):
"model": model,
"messages": messages,
"stream": True,
"vertex_ai_location": "us-east5",
"vertex_ai_location": region,
}
if sync_mode:
response = litellm.completion(**data)