fix: update maximum local concurrency limit based on API response (#242)

Tomas2D · web-flow · commit 48bec2d03f7b · 2023-12-08T17:30:41.000+01:00
diff --git a/src/genai/services/async_generator.py b/src/genai/services/async_generator.py
@@ -160,12 +160,17 @@ async def _task(self, inputs, batch_num):
     async def _schedule_requests(self):
         local_concurrency_limit = max(self._max_concurrency_limit or math.inf, 1)
 
+        is_tokenize_request = self.fn == "tokenize"
+        if not is_tokenize_request:
+            max_generate_capacity = self.service.generate_limits().tokenCapacity
+            local_concurrency_limit = min(local_concurrency_limit, max_generate_capacity)
+
         async def get_limits():
             nonlocal local_concurrency_limit
             if local_concurrency_limit <= 0:
                 return local_concurrency_limit
 
-            if self.fn == "tokenize":
+            if is_tokenize_request:
                 return min(local_concurrency_limit, len(self.prompts))
 
             limits = self.service.generate_limits()