88import json
99import numbers
1010from pathlib import Path
11- from typing import Literal , Optional , Sequence , Tuple , Union
11+ from typing import List , Literal , Optional , Sequence , Tuple , Union
1212
1313import torch
1414from torch import device as torch_device , Tensor
1919 create_decoder ,
2020 ERROR_REPORTING_INSTRUCTIONS ,
2121)
22- from torchcodec .transforms import DecoderNativeTransform , Resize
22+ from torchcodec .transforms import DecoderTransform , Resize
2323
2424
2525class VideoDecoder :
@@ -67,6 +67,10 @@ class VideoDecoder:
6767 probably is. Default: "exact".
6868 Read more about this parameter in:
6969 :ref:`sphx_glr_generated_examples_decoding_approximate_mode.py`
70+ transforms (sequence of transform objects, optional): Sequence of transforms to be
71+ applied to the decoded frames by the decoder itself, in order. Accepts both
72+ torchcodec.transforms.DecoderTransform and torchvision.transforms.v2.Transform
73+         objects. All transforms are applied in the output pixel format and colorspace.
7074 custom_frame_mappings (str, bytes, or file-like object, optional):
7175 Mapping of frames to their metadata, typically generated via ffprobe.
7276 This enables accurate frame seeking without requiring a full video scan.
@@ -104,8 +108,8 @@ def __init__(
104108 dimension_order : Literal ["NCHW" , "NHWC" ] = "NCHW" ,
105109 num_ffmpeg_threads : int = 1 ,
106110 device : Optional [Union [str , torch_device ]] = "cpu" ,
107- transforms : Optional [Sequence [DecoderNativeTransform ]] = None ,
108111 seek_mode : Literal ["exact" , "approximate" ] = "exact" ,
112+ transforms : Optional [Sequence [DecoderTransform ]] = None ,
109113 custom_frame_mappings : Optional [
110114 Union [str , bytes , io .RawIOBase , io .BufferedReader ]
111115 ] = None ,
@@ -437,15 +441,23 @@ def _get_and_validate_stream_metadata(
437441 )
438442
439443
440- # This function, _make_transform_specs, and the transforms argument to
441- # VideoDecoder actually accept a union of DecoderNativeTransform and
442- # TorchVision transforms. We don't put that in our type annotation because
443- # that would require importing torchvision at module scope which would mean we
444- # have a hard dependency on torchvision.
445- # TODO: better explanation of the above.
446444def _convert_to_decoder_native_transforms (
447- transforms : Sequence [DecoderNativeTransform ],
448- ) -> Sequence [DecoderNativeTransform ]:
445+ transforms : Sequence [DecoderTransform ],
446+ ) -> List [DecoderTransform ]:
447+ """Convert a sequence of transforms that may contain TorchVision transform
448+ objects into a list of only TorchCodec transform objects.
449+
450+ Args:
451+        transforms: Sequence of transform objects. The objects can be one of two
452+ types:
453+ 1. torchcodec.transforms.DecoderTransform
454+ 2. torchvision.transforms.v2.Transform
455+ Our type annotation only mentions the first type so that we don't
456+ have a hard dependency on TorchVision.
457+
458+ Returns:
459+ List of DecoderTransform objects.
460+ """
449461 try :
450462 from torchvision .transforms import v2
451463
@@ -455,11 +467,11 @@ def _convert_to_decoder_native_transforms(
455467
456468 converted_transforms = []
457469 for transform in transforms :
458- if not isinstance (transform , DecoderNativeTransform ):
470+ if not isinstance (transform , DecoderTransform ):
459471 if not tv_available :
460472 raise ValueError (
461473 f"The supplied transform, { transform } , is not a TorchCodec "
462- " DecoderNativeTransform . TorchCodec also accept TorchVision "
474+ " DecoderTransform . TorchCodec also accept TorchVision "
463475 "v2 transforms, but TorchVision is not installed."
464476 )
465477 if isinstance (transform , v2 .Resize ):
@@ -472,7 +484,7 @@ def _convert_to_decoder_native_transforms(
472484 else :
473485 raise ValueError (
474486 f"Unsupported transform: { transform } . Transforms must be "
475- "either a TorchCodec DecoderNativeTransform or a TorchVision "
487+ "either a TorchCodec DecoderTransform or a TorchVision "
476488 "v2 transform."
477489 )
478490 else :
@@ -482,8 +494,23 @@ def _convert_to_decoder_native_transforms(
482494
483495
484496def _make_transform_specs (
485- transforms : Optional [Sequence [DecoderNativeTransform ]],
497+ transforms : Optional [Sequence [DecoderTransform ]],
486498) -> str :
499+ """Given a sequence of transforms, turn those into the specification string
500+ the core API expects.
501+
502+ Args:
503+ transforms: Optional sequence of transform objects. The objects can be
504+ one of two types:
505+ 1. torchcodec.transforms.DecoderTransform
506+ 2. torchvision.transforms.v2.Transform
507+ Our type annotation only mentions the first type so that we don't
508+ have a hard dependency on TorchVision.
509+
510+ Returns:
511+ String of transforms in the format the core API expects: transform
512+        specifications separated by semicolons.
513+ """
487514 if transforms is None :
488515 return ""
489516
0 commit comments