Skip to content
This repository was archived by the owner on Mar 7, 2025. It is now read-only.

Commit 48bec2d

Browse files
authored
fix: update maximal local concurrency limit based on API response (#242)
1 parent c497dd8 commit 48bec2d

File tree

1 file changed: 6 additions and 1 deletion.

src/genai/services/async_generator.py

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -160,12 +160,17 @@ async def _task(self, inputs, batch_num):
     async def _schedule_requests(self):
         local_concurrency_limit = max(self._max_concurrency_limit or math.inf, 1)
 
+        is_tokenize_request = self.fn == "tokenize"
+        if not is_tokenize_request:
+            max_generate_capacity = self.service.generate_limits().tokenCapacity
+            local_concurrency_limit = min(local_concurrency_limit, max_generate_capacity)
+
         async def get_limits():
             nonlocal local_concurrency_limit
             if local_concurrency_limit <= 0:
                 return local_concurrency_limit
 
-            if self.fn == "tokenize":
+            if is_tokenize_request:
                 return min(local_concurrency_limit, len(self.prompts))
 
             limits = self.service.generate_limits()

0 commit comments

Comments (0)