
Commit 34ad958

Merge pull request #2427 from BerriAI/litellm_address_memory_usage
tests - monitor memory usage with litellm
2 parents 4c2dbf7 + 1917ee7 commit 34ad958

File tree: 5 files changed, +283 −0 lines changed
Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
from fastapi import FastAPI
import uvicorn
from memory_profiler import profile, memory_usage
import os
import traceback
import asyncio
import pytest
import litellm
from litellm import Router
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict
from dotenv import load_dotenv
import uuid

load_dotenv()

model_list = [
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {
            "model": "azure/chatgpt-v-2",
            "api_key": os.getenv("AZURE_API_KEY"),
            "api_version": os.getenv("AZURE_API_VERSION"),
            "api_base": os.getenv("AZURE_API_BASE"),
        },
        "tpm": 240000,
        "rpm": 1800,
    },
    {
        "model_name": "text-embedding-ada-002",
        "litellm_params": {
            "model": "azure/azure-embedding-model",
            "api_key": os.getenv("AZURE_API_KEY"),
            "api_base": os.getenv("AZURE_API_BASE"),
        },
        "tpm": 100000,
        "rpm": 10000,
    },
]

litellm.set_verbose = True
litellm.cache = litellm.Cache(
    type="s3", s3_bucket_name="litellm-my-test-bucket-2", s3_region_name="us-east-1"
)
router = Router(model_list=model_list, set_verbose=True)

app = FastAPI()


@app.get("/")
async def read_root():
    return {"message": "Welcome to the FastAPI endpoint!"}


# @profile must sit below @app.post: FastAPI registers whatever function the
# route decorator receives, so the profiler has to wrap the handler first.
@app.post("/router_acompletion")
@profile
async def router_acompletion():
    # embedding call, then a chat completion against the same router
    question = f"This is a test: {uuid.uuid4()}" * 100
    resp = await router.aembedding(model="text-embedding-ada-002", input=question)
    print("embedding-resp", resp)

    response = await router.acompletion(
        model="gpt-3.5-turbo", messages=[{"role": "user", "content": question}]
    )
    print("completion-resp", response)
    return response


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
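The endpoint above relies on memory_profiler's @profile decorator, which prints a line-by-line memory report on each request. The same package also exposes memory_usage (imported above but unused) for sampling a whole callable; a minimal sketch of that pattern, where the work() function is illustrative and not part of the commit:

from memory_profiler import memory_usage

def work():
    # Stand-in workload: allocate roughly 10 MB, released on return.
    data = [b"x" * 1024 for _ in range(10_000)]
    return len(data)

# memory_usage((func, args, kwargs)) runs the callable and returns a list
# of RSS readings in MiB, sampled every `interval` seconds.
readings = memory_usage((work, (), {}), interval=0.1)
print(f"peak memory: {max(readings):.1f} MiB")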
Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
#### What this tests ####
# Monitors memory usage while litellm's Router serves concurrent
# embedding + chat-completion calls.

from memory_profiler import profile, memory_usage
import sys, os, time
import traceback, asyncio
import pytest

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import litellm
from litellm import Router
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict
from dotenv import load_dotenv
import uuid

load_dotenv()


model_list = [
    {
        "model_name": "gpt-3.5-turbo",  # openai model name
        "litellm_params": {  # params for litellm completion/embedding call
            "model": "azure/chatgpt-v-2",
            "api_key": os.getenv("AZURE_API_KEY"),
            "api_version": os.getenv("AZURE_API_VERSION"),
            "api_base": os.getenv("AZURE_API_BASE"),
        },
        "tpm": 240000,
        "rpm": 1800,
    },
    {
        "model_name": "text-embedding-ada-002",
        "litellm_params": {
            "model": "azure/azure-embedding-model",
            "api_key": os.environ["AZURE_API_KEY"],
            "api_base": os.environ["AZURE_API_BASE"],
        },
        "tpm": 100000,
        "rpm": 10000,
    },
]
litellm.set_verbose = True
litellm.cache = litellm.Cache(
    type="s3", s3_bucket_name="litellm-my-test-bucket-2", s3_region_name="us-east-1"
)
router = Router(
    model_list=model_list,
    set_verbose=True,
)  # type: ignore


@profile
async def router_acompletion():
    # embedding call
    question = f"This is a test: {uuid.uuid4()}" * 100
    resp = await router.aembedding(model="text-embedding-ada-002", input=question)
    print("embedding-resp", resp)

    response = await router.acompletion(
        model="gpt-3.5-turbo", messages=[{"role": "user", "content": question}]
    )
    print("completion-resp", response)
    return response


async def main():
    for i in range(1):
        start = time.time()
        n = 50  # Number of concurrent tasks
        tasks = [router_acompletion() for _ in range(n)]

        # return_exceptions=True keeps one failure from cancelling the batch
        # and lets the errors be collected for the log below.
        chat_completions = await asyncio.gather(*tasks, return_exceptions=True)

        successful_completions = [
            c for c in chat_completions if not isinstance(c, Exception)
        ]

        # Write errors to error_log.txt
        with open("error_log.txt", "a") as error_log:
            for completion in chat_completions:
                if isinstance(completion, Exception):
                    error_log.write(str(completion) + "\n")

        print(n, time.time() - start, len(successful_completions))
        await asyncio.sleep(10)  # non-blocking pause between rounds


if __name__ == "__main__":
    # Blank out contents of error_log.txt
    open("error_log.txt", "w").close()

    asyncio.run(main())
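memory_profiler reports process-level RSS, which shows that memory grows but not where. The standard library's tracemalloc can attribute growth to specific allocation sites and could complement a test like this; a minimal sketch, not part of the commit:

import tracemalloc

tracemalloc.start()

# ... run the workload under test here; a stand-in allocation for the sketch:
retained = [b"x" * 1024 for _ in range(10_000)]

current, peak = tracemalloc.get_traced_memory()
print(f"current={current / 1e6:.1f} MB, peak={peak / 1e6:.1f} MB")

# Top three allocation sites, grouped by source line.
for stat in tracemalloc.take_snapshot().statistics("lineno")[:3]:
    print(stat)

tracemalloc.stop()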
Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
#### What this tests ####
# Monitors memory usage while litellm's Router serves concurrent
# embedding + chat-completion calls.

from memory_profiler import profile, memory_usage
import sys, os, time
import traceback, asyncio
import pytest

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import litellm
from litellm import Router
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict
from dotenv import load_dotenv
import uuid

load_dotenv()


model_list = [
    {
        "model_name": "gpt-3.5-turbo",  # openai model name
        "litellm_params": {  # params for litellm completion/embedding call
            "model": "azure/chatgpt-v-2",
            "api_key": os.getenv("AZURE_API_KEY"),
            "api_version": os.getenv("AZURE_API_VERSION"),
            "api_base": os.getenv("AZURE_API_BASE"),
        },
        "tpm": 240000,
        "rpm": 1800,
    },
    {
        "model_name": "text-embedding-ada-002",
        "litellm_params": {
            "model": "azure/azure-embedding-model",
            "api_key": os.environ["AZURE_API_KEY"],
            "api_base": os.environ["AZURE_API_BASE"],
        },
        "tpm": 100000,
        "rpm": 10000,
    },
]
litellm.set_verbose = True
litellm.cache = litellm.Cache(
    type="s3", s3_bucket_name="litellm-my-test-bucket-2", s3_region_name="us-east-1"
)
router = Router(
    model_list=model_list,
    set_verbose=True,
)  # type: ignore


@profile
async def router_acompletion():
    # embedding call
    question = f"This is a test: {uuid.uuid4()}" * 100
    resp = await router.aembedding(model="text-embedding-ada-002", input=question)
    print("embedding-resp", resp)

    response = await router.acompletion(
        model="gpt-3.5-turbo", messages=[{"role": "user", "content": question}]
    )
    print("completion-resp", response)
    return response


async def main():
    for i in range(1):
        start = time.time()
        n = 50  # Number of concurrent tasks
        tasks = [router_acompletion() for _ in range(n)]

        # return_exceptions=True keeps one failure from cancelling the batch
        # and lets the errors be collected for the log below.
        chat_completions = await asyncio.gather(*tasks, return_exceptions=True)

        successful_completions = [
            c for c in chat_completions if not isinstance(c, Exception)
        ]

        # Write errors to error_log.txt
        with open("error_log.txt", "a") as error_log:
            for completion in chat_completions:
                if isinstance(completion, Exception):
                    error_log.write(str(completion) + "\n")

        print(n, time.time() - start, len(successful_completions))
        await asyncio.sleep(10)  # non-blocking pause between rounds


if __name__ == "__main__":
    # Blank out contents of error_log.txt
    open("error_log.txt", "w").close()

    asyncio.run(main())
Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
import requests
from concurrent.futures import ThreadPoolExecutor

# Replace the URL with your actual endpoint
url = "http://localhost:8000/router_acompletion"


def make_request(session):
    headers = {"Content-Type": "application/json"}
    data = {}  # Replace with your JSON payload if needed

    response = session.post(url, headers=headers, json=data)
    print(f"Status code: {response.status_code}")


# Number of concurrent requests
num_requests = 20

# Create a session to reuse the underlying TCP connection
with requests.Session() as session:
    # Use ThreadPoolExecutor for concurrent requests
    with ThreadPoolExecutor(max_workers=num_requests) as executor:
        # Use list comprehension to submit tasks
        futures = [executor.submit(make_request, session) for _ in range(num_requests)]

        # Wait for all futures to complete
        for future in futures:
            future.result()
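The load generator above drives blocking requests calls from a thread pool. An async client is a common alternative; a hedged sketch using httpx (a third-party package, not part of this commit) against the same local endpoint:

import asyncio

import httpx


async def fire(n: int = 20) -> None:
    # One AsyncClient shares a connection pool across all requests.
    async with httpx.AsyncClient() as client:
        tasks = [
            client.post("http://localhost:8000/router_acompletion", json={})
            for _ in range(n)
        ]
        responses = await asyncio.gather(*tasks)
        for r in responses:
            print("Status code:", r.status_code)


asyncio.run(fire())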

litellm/caching.py

Lines changed: 1 addition & 0 deletions
@@ -48,6 +48,7 @@ def __init__(self):
        self.ttl_dict = {}

    def set_cache(self, key, value, **kwargs):
+        print_verbose("InMemoryCache: set_cache")
        self.cache_dict[key] = value
        if "ttl" in kwargs:
            self.ttl_dict[key] = time.time() + kwargs["ttl"]
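The instrumented set_cache stores an absolute expiry timestamp per key in ttl_dict. For context, a stripped-down sketch of how this pattern typically pairs with a read path; the get_cache method below is illustrative, not litellm's actual code:

import time


class TTLCacheSketch:
    def __init__(self):
        self.cache_dict = {}
        self.ttl_dict = {}

    def set_cache(self, key, value, **kwargs):
        self.cache_dict[key] = value
        if "ttl" in kwargs:
            # Absolute expiry time, as in the diff above.
            self.ttl_dict[key] = time.time() + kwargs["ttl"]

    def get_cache(self, key):
        # Hypothetical read path: evict lazily once the TTL has passed.
        if key in self.ttl_dict and time.time() > self.ttl_dict[key]:
            self.cache_dict.pop(key, None)
            self.ttl_dict.pop(key, None)
            return None
        return self.cache_dict.get(key)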
