Skip to content

Commit 2cd3f65

Browse files
committed
Changes
1 parent a2da767 commit 2cd3f65

File tree

2 files changed

+14
-12
lines changed

2 files changed

+14
-12
lines changed

src/torchcodec/decoders/_video_decoder.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from typing import List, Literal, Optional, Sequence, Tuple, Union
1212

1313
import torch
14-
from torch import device as torch_device, Tensor
14+
from torch import device as torch_device, nn, Tensor
1515

1616
from torchcodec import _core as core, Frame, FrameBatch
1717
from torchcodec.decoders._decoder_utils import (
@@ -69,8 +69,10 @@ class VideoDecoder:
6969
:ref:`sphx_glr_generated_examples_decoding_approximate_mode.py`
7070
transforms (sequence of transform objects, optional): Sequence of transforms to be
7171
applied to the decoded frames by the decoder itself, in order. Accepts both
72-
``torchcodec.transforms.DecoderTransform`` and ``torchvision.transforms.v2.Transform``
73-
objects. All transforms are applied in the output pixel format and colorspace.
72+
:class:`torchcodec.transforms.DecoderTransform` and
73+
:class:`torchvision.transforms.v2.Transform` objects. All transforms are applied
74+
in the output pixel format and colorspace. Read more about this parameter in:
75+
SCOTT_NEEDS_TO_WRITE_A_TUTORIAL.
7476
custom_frame_mappings (str, bytes, or file-like object, optional):
7577
Mapping of frames to their metadata, typically generated via ffprobe.
7678
This enables accurate frame seeking without requiring a full video scan.
@@ -109,7 +111,7 @@ def __init__(
109111
num_ffmpeg_threads: int = 1,
110112
device: Optional[Union[str, torch_device]] = "cpu",
111113
seek_mode: Literal["exact", "approximate"] = "exact",
112-
transforms: Optional[Sequence[DecoderTransform]] = None,
114+
transforms: Optional[Sequence[Union[DecoderTransform, nn.Module]]] = None,
113115
custom_frame_mappings: Optional[
114116
Union[str, bytes, io.RawIOBase, io.BufferedReader]
115117
] = None,
@@ -442,7 +444,7 @@ def _get_and_validate_stream_metadata(
442444

443445

444446
def _convert_to_decoder_native_transforms(
445-
transforms: Sequence[DecoderTransform],
447+
transforms: Sequence[Union[DecoderTransform, nn.Module]],
446448
) -> List[DecoderTransform]:
447449
"""Convert a sequence of transforms that may contain TorchVision transform
448450
objects into a list of only TorchCodec transform objects.
@@ -494,7 +496,7 @@ def _convert_to_decoder_native_transforms(
494496

495497

496498
def _make_transform_specs(
497-
transforms: Optional[Sequence[DecoderTransform]],
499+
transforms: Optional[Sequence[Union[DecoderTransform, nn.Module]]],
498500
) -> str:
499501
"""Given a sequence of transforms, turn those into the specification string
500502
the core API expects.

src/torchcodec/transforms/_decoder_transforms.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,21 +14,21 @@ class DecoderTransform(ABC):
1414
"""Base class for all decoder transforms.
1515
1616
A DecoderTransform is a transform that is applied by the decoder before
17-
returning the decoded frame. The implementation does not live in TorchCodec
18-
itself, but in the underlying decoder. Applying DecoderTransforms to frames
17+
returning the decoded frame. Applying DecoderTransforms to frames
1918
should be both faster and more memory efficient than receiving normally
2019
decoded frames and applying the same kind of transform.
2120
2221
Most DecoderTransforms have a complementary transform in TorchVision,
2322
specifically in torchvision.transforms.v2. For such transforms, we ensure
2423
that:
2524
26-
1. Default behaviors are the same.
27-
2. The parameters for the DecoderTransform are a subset of the
25+
1. The names are the same.
26+
2. Default behaviors are the same.
27+
3. The parameters for the DecoderTransform are a subset of the
2828
TorchVision transform.
29-
3. Parameters with the same name control the same behavior and accept a
29+
4. Parameters with the same name control the same behavior and accept a
3030
subset of the same types.
31-
4. The difference between the frames returned by a DecoderTransform and
31+
5. The difference between the frames returned by a DecoderTransform and
3232
the complementary TorchVision transform are small.
3333
3434
All DecoderTransforms are applied in the output pixel format and colorspace.

0 commit comments

Comments
 (0)