@@ -1,8 +1,10 @@
 import asyncio
-from typing import TYPE_CHECKING, AsyncIterator, Literal, cast
-import os
+from typing import TYPE_CHECKING, AsyncIterator, Literal
 
-from art.client import Client
+from openai._types import NOT_GIVEN
+from tqdm import auto as tqdm
+
+from art.client import Client, ExperimentalTrainingConfig
 from art.utils.deploy_model import LoRADeploymentJob, LoRADeploymentProvider
 
 from .. import dev
@@ -57,7 +59,6 @@ def _model_inference_name(self, model: "TrainableModel") -> str:
         assert model.entity is not None, "Model entity is required"
         return f"{model.entity}/{model.project}/{model.name}"
 
-
     async def _get_step(self, model: "Model") -> int:
         if model.trainable:
             assert model.id is not None, "Model ID is required"
@@ -75,6 +76,7 @@ async def _delete_checkpoints(
         benchmark_smoothing: float,
     ) -> None:
         # TODO: potentially implement benchmark smoothing
+        assert model.id is not None, "Model ID is required"
         max_metric: float | None = None
         max_step: int | None = None
         all_steps: list[int] = []
@@ -110,11 +112,12 @@ async def _log(
             print(f"Model {model.name} is not trainable; skipping logging.")
             return
 
+        assert model.id is not None, "Model ID is required"
+
         await self._client.checkpoints.log_trajectories(
             model_id=model.id, trajectory_groups=trajectory_groups, split=split
         )
 
-
     async def _train_model(
         self,
         model: "TrainableModel",
@@ -124,15 +127,36 @@ async def _train_model(
         verbose: bool = False,
     ) -> AsyncIterator[dict[str, float]]:
         assert model.id is not None, "Model ID is required"
+
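+        # Create a training job from the gathered trajectory groups.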
         training_job = await self._client.training_jobs.create(
             model_id=model.id,
             trajectory_groups=trajectory_groups,
-            experimental_config=dict(learning_rate=config.learning_rate),
+            experimental_config=ExperimentalTrainingConfig(
+                learning_rate=config.learning_rate,
+                precalculate_logprobs=dev_config.get("precalculate_logprobs", None),
+            ),
         )
-        while training_job.status != "COMPLETED":
-            await asyncio.sleep(1)
-            training_job = await self._client.training_jobs.retrieve(training_job.id)
-        yield {"num_gradient_steps": 1}
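+        # Follow the job's event stream instead of polling a terminal status;
+        # "after" is a cursor so each poll only fetches events not yet seen.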
+        after: str | None = None
+        num_gradient_steps: int | None = None
+        pbar: tqdm.tqdm | None = None
+        while True:
+            await asyncio.sleep(0.5)
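+            # Page through new events; NOT_GIVEN omits the cursor on the first poll.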
+            async for event in self._client.training_jobs.events.list(
+                training_job_id=training_job.id, after=after or NOT_GIVEN
+            ):
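+                # "gradient_step" events carry per-step training metrics in event.data.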
+                if event.type == "gradient_step":
+                    assert pbar is not None and num_gradient_steps is not None
+                    pbar.update(1)
+                    pbar.set_postfix(event.data)
+                    yield {**event.data, "num_gradient_steps": num_gradient_steps}
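+                # "training_started" reports the total number of gradient steps,
+                # which sizes the progress bar.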
+                elif event.type == "training_started":
+                    num_gradient_steps = event.data["num_gradient_steps"]
+                    if pbar is None:
+                        pbar = tqdm.tqdm(total=num_gradient_steps, desc="train")
+                    continue
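+                # "training_ended" closes out the generator.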
+                elif event.type == "training_ended":
+                    return
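+                # Advance the cursor past the handled event.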
+                after = event.id
 
     # ------------------------------------------------------------------
     # Experimental support for S3
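
A minimal sketch of how the rewritten _train_model generator might be consumed.
The driver function and its argument values are illustrative assumptions, not part
of this diff; only the keyword names (model, trajectory_groups, config, dev_config)
appear in the code above.

    # Hypothetical caller: drain per-step metrics as the job streams them.
    async def run_training(backend, model, trajectory_groups, config):
        async for metrics in backend._train_model(
            model=model,
            trajectory_groups=trajectory_groups,
            config=config,
            dev_config={},  # assumed to be a plain dict, per dev_config.get(...) above
        ):
            # Each dict is one gradient step's event.data plus "num_gradient_steps".
            print(metrics)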