feat: Implement MathVista training script for image-based question answering

bradhilton · bradhilton · commit 585f903ddd95 · 2025-10-17T01:58:44.000Z
* Added a new script to train a model using image and question pairs from the MathVista dataset.
* Integrated asynchronous processing for efficient training and trajectory logging.
* Enhanced image handling by saving decoded images to a temporary directory for model input.
* Improved argument parsing for customizable training runs.
diff --git a/dev/math-vista/math-vista.py b/dev/math-vista/math-vista.py
@@ -0,0 +1,136 @@
+import argparse
+import asyncio
+import itertools
+import os
+import re
+from typing import Iterator, TypedDict, cast
+
+import polars as pl
+
+import art
+from art.local import LocalBackend
+
+
+class DecodedImage(TypedDict):
+    """Raw image payload attached to a dataset row."""
+
+    # Image file contents; the rollout writes these bytes to disk unchanged.
+    bytes: bytes
+
+
+class Scenario(TypedDict):
+    """One dataset row as consumed by the rollout.
+
+    Rows are read from the MathVista parquet file with polars
+    (``to_dicts`` / ``iter_rows(named=True)``) and cast to this type.
+    """
+
+    # Problem identifier; not read anywhere in this script.
+    # NOTE(review): confirm the dataset stores this as an integer.
+    pid: int
+    # Question text; sent to the model as the text part of the prompt.
+    question: str
+    # Reference answer; compared case-insensitively with the model's boxed answer.
+    answer: str
+    # Relative image path; appended to "/tmp/" to decide where the image is saved.
+    image: str
+    # Image payload whose bytes are written to that path.
+    decoded_image: DecodedImage
+
+
+async def main(model_name: str, steps: int) -> None:
+    # Load and shuffle the dataset
+    df = pl.read_parquet(
+        "hf://datasets/AI4Math/MathVista/data/testmini-00000-of-00001-725687bf7a18d64b.parquet"
+    ).sample(fraction=1.0, shuffle=True, seed=42)
+
+    val_scenarios = cast(list[Scenario], df.head(64).to_dicts())
+    train_scenarios_iter = cast(Iterator[Scenario], df.tail(-64).iter_rows(named=True))
+
+    # Initialize trainable model and backend
+    model = art.TrainableModel(
+        name=model_name,
+        project="math-vista",
+        base_model="Qwen/Qwen2.5-VL-7B-Instruct",
+    )
+
+    async def rollout(scenario: Scenario) -> art.Trajectory:
+        image_path = f"/tmp/{scenario['image']}"
+        os.makedirs(os.path.dirname(image_path), exist_ok=True)
+        with open(image_path, "wb") as f:
+            f.write(scenario["decoded_image"]["bytes"])
+
+        trajectory = art.Trajectory(messages_and_choices=[], reward=0.0)
+        trajectory.messages_and_choices = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": scenario["question"]
+                        + "\n\nNote: Provide your answer in a LaTeX box.",
+                    },
+                    {"type": "image_url", "image_url": {"url": f"file://{image_path}"}},
+                ],
+            }
+        ]
+
+        chat_completion = await client.chat.completions.create(
+            model=model.name, messages=trajectory.messages()
+        )
+        choice = chat_completion.choices[0]
+        trajectory.messages_and_choices.append(choice)
+        content = choice.message.content
+        assert content is not None
+
+        if matches := list(re.finditer(r"\\boxed\{(.*?)\}", content, re.DOTALL)):
+            match = matches[-1]
+            answer = match.group(1)
+            if answer.lower() == scenario["answer"].lower():
+                trajectory.reward = 1.0
+        return trajectory
+
+    SCENARIOS_PER_STEP = 8
+    TRAJECTORY_GROUP_SIZE = 8
+
+    with LocalBackend() as backend:
+        await model.register(backend)
+        client = model.openai_client()
+
+        start = await model.get_step()
+        train_scenarios_iter = itertools.cycle(train_scenarios_iter)
+        for _ in range(start * SCENARIOS_PER_STEP):
+            next(train_scenarios_iter)
+
+        # Training loop
+        for _ in range(start, steps):
+            train_scenarios = [
+                next(train_scenarios_iter) for _ in range(SCENARIOS_PER_STEP)
+            ]
+            val_trajectories, train_trajectory_groups = await asyncio.gather(
+                art.gather_trajectories(
+                    (rollout(scenario) for scenario in val_scenarios),
+                    pbar_desc="gather(val)",
+                    max_exceptions=32,
+                ),
+                art.gather_trajectory_groups(
+                    (
+                        art.TrajectoryGroup(
+                            rollout(scenario) for _ in range(TRAJECTORY_GROUP_SIZE)
+                        )
+                        for scenario in train_scenarios
+                    ),
+                    pbar_desc="gather(train)",
+                    max_exceptions=32,
+                ),
+            )
+            await model.log(val_trajectories)
+            await model.train(train_trajectory_groups)
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Minimal MathVista trainer script")
+    parser.add_argument(
+        "-n",
+        "--name",
+        required=True,
+        help="Run/model name to use for the TrainableModel",
+    )
+    parser.add_argument(
+        "-s",
+        "--steps",
+        type=int,
+        default=1000,
+        help="Number of training steps to run",
+    )
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    asyncio.run(main(args.name, args.steps))
diff --git a/src/art/local/backend.py b/src/art/local/backend.py
@@ -498,9 +498,9 @@ async def _train_model(
             num_gradient_steps = int(
                 result.pop("num_gradient_steps", estimated_gradient_steps)
             )
-            assert (
-                num_gradient_steps == estimated_gradient_steps
-            ), f"num_gradient_steps {num_gradient_steps} != estimated_gradient_steps {estimated_gradient_steps}"
+            assert num_gradient_steps == estimated_gradient_steps, (
+                f"num_gradient_steps {num_gradient_steps} != estimated_gradient_steps {estimated_gradient_steps}"
+            )
             results.append(result)
             yield {**result, "num_gradient_steps": num_gradient_steps}
             pbar.update(1)
diff --git a/src/art/preprocessing/tokenize.py b/src/art/preprocessing/tokenize.py
@@ -202,9 +202,9 @@ def tokenize_trajectory(
             assistant_mask[start:end] = [1] * len(content_token_ids)
         else:
             choice = message
-            assert (
-                choice.logprobs or allow_training_without_logprobs
-            ), "Chat completion choices must have logprobs"
+            assert choice.logprobs or allow_training_without_logprobs, (
+                "Chat completion choices must have logprobs"
+            )
             if not choice.logprobs:
                 continue
             token_logprobs = choice.logprobs.content or choice.logprobs.refusal or []
diff --git a/src/art/unsloth/service.py b/src/art/unsloth/service.py
@@ -185,9 +185,9 @@ async def train(
                     for task in done:
                         result = task.result()
                         # If `result` is `None`, the training task finished somehow.
-                        assert (
-                            result is not None
-                        ), "The training task should never finish."
+                        assert result is not None, (
+                            "The training task should never finish."
+                        )
                         self.results_queue.task_done()
                         if warmup:
                             from .state import gc_and_empty_cuda_cache
diff --git a/src/art/unsloth/train.py b/src/art/unsloth/train.py
@@ -114,9 +114,9 @@ def compute_loss(
         next_input_ids = shift_tensor(inputs["tokens"], 0)
         chunk_size = _config.get("logprob_calculation_chunk_size", 1024)
         # Assert that sequence length is evenly divisible by the chunk size
-        assert (
-            seq_len % chunk_size == 0
-        ), f"Sequence length ({seq_len}) must be evenly divisible by chunk size ({chunk_size})"
+        assert seq_len % chunk_size == 0, (
+            f"Sequence length ({seq_len}) must be evenly divisible by chunk size ({chunk_size})"
+        )
         os.environ["UNSLOTH_RETURN_HIDDEN_STATES"] = "1"
         forward_kwargs = {}
         if "pixel_values" in inputs:
@@ -371,9 +371,7 @@ def _calculate_logprobs(
         chunk_logits = torch.matmul(chunk_hs, lm_head_t)  # [B, chunk_size, V]
         chunk_selected_logits = torch.gather(
             chunk_logits, dim=-1, index=chunk_input_ids.unsqueeze(-1)
-        ).squeeze(
-            -1
-        )  # [B, chunk_size]
+        ).squeeze(-1)  # [B, chunk_size]
         chunk_logsumexp = torch.logsumexp(chunk_logits, dim=-1)  # [B, chunk_size]
         log_probs[:, i : i + chunk_size] = chunk_selected_logits - chunk_logsumexp
 
diff --git a/src/art/utils/trajectory_logging.py b/src/art/utils/trajectory_logging.py
@@ -1,5 +1,5 @@
 import json
-from typing import Any, cast, Iterator
+from typing import Any, Iterator, cast
 
 import yaml