Better importing, better docs

scotts · scotts · commit 70b5976012b7 · 2025-11-13T12:37:05.000-08:00
diff --git a/docs/source/api_ref_transforms.rst b/docs/source/api_ref_transforms.rst
@@ -1,8 +1,8 @@
-.. _samplers:
+.. _transforms:
 
-===================
+=====================
 torchcodec.transforms
-===================
+=====================
 
 .. currentmodule:: torchcodec.transforms
 
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -209,6 +209,7 @@ def __call__(self, filename):
 intersphinx_mapping = {
     "python": ("https://docs.python.org/3/", None),
     "torch": ("https://pytorch.org/docs/stable/", None),
+    "torchvision": ("https://docs.pytorch.org/vision/stable/", None),
     "numpy": ("https://numpy.org/doc/stable/", None),
     "PIL": ("https://pillow.readthedocs.io/en/stable/", None),
     "matplotlib": ("https://matplotlib.org/stable/", None),
diff --git a/src/torchcodec/decoders/_video_decoder.py b/src/torchcodec/decoders/_video_decoder.py
@@ -70,10 +70,8 @@ class VideoDecoder:
         transforms (sequence of transform objects, optional): Sequence of transforms to be
             applied to the decoded frames by the decoder itself, in order. Accepts both
             :class:`~torchcodec.transforms.DecoderTransform` and
-            `torchvision.transforms.v2.Transform <https://docs.pytorch.org/vision/stable/transforms.html#v2-api-reference-recommended>`_
-            objects. All transforms are applied
-            in the ouput pixel format and colorspace. Read more about this parameter in:
-            TODO_DECODER_TRANSFORMS_TUTORIAL.
+            :class:`~torchvision.transforms.v2.Transform`
+            objects. Read more about this parameter in: TODO_DECODER_TRANSFORMS_TUTORIAL.
         custom_frame_mappings (str, bytes, or file-like object, optional):
             Mapping of frames to their metadata, typically generated via ffprobe.
             This enables accurate frame seeking without requiring a full video scan.
@@ -477,7 +475,7 @@ def _convert_to_decoder_transforms(
                     " DecoderTransform. TorchCodec also accept TorchVision "
                     "v2 transforms, but TorchVision is not installed."
                 )
-            if isinstance(transform, v2.Resize):
+            elif isinstance(transform, v2.Resize):
                 converted_transforms.append(Resize._from_torchvision(transform))
             else:
                 raise ValueError(
@@ -513,7 +511,7 @@ def _make_transform_specs(
         return ""
 
     transforms = _convert_to_decoder_transforms(transforms)
-    return ";".join([t._make_params() for t in transforms])
+    return ";".join([t._make_transform_spec() for t in transforms])
 
 
 def _read_custom_frame_mappings(
diff --git a/src/torchcodec/transforms/_decoder_transforms.py b/src/torchcodec/transforms/_decoder_transforms.py
@@ -6,6 +6,7 @@
 
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
+from types import ModuleType
 from typing import Sequence
 
 from torch import nn
@@ -20,34 +21,41 @@ class DecoderTransform(ABC):
     should be both faster and more memory efficient than receiving normally
     decoded frames and applying the same kind of transform.
 
-    Most `DecoderTransform` objects have a complementary transform in TorchVision,
-    specificially in
-    `torchvision.transforms.v2 <https://docs.pytorch.org/vision/stable/transforms.html#v2-api-reference-recommended>`_.
-    For such transforms, we ensure that:
+    Most ``DecoderTransform`` objects have a complementary transform in TorchVision,
+    specifically in `torchvision.transforms.v2 <https://docs.pytorch.org/vision/stable/transforms.html>`_. For such transforms, we
+    ensure that:
 
       1. The names are the same.
       2. Default behaviors are the same.
-      3. The parameters for the `DecoderTransform` object are a subset of the
-         TorchVision transform object.
+      3. The parameters for the ``DecoderTransform`` object are a subset of the
+         TorchVision :class:`~torchvision.transforms.v2.Transform` object.
       4. Parameters with the same name control the same behavior and accept a
          subset of the same types.
       5. The difference between the frames returned by a decoder transform and
-         the complementary TorchVision transform are small.
-
-    All decoder transforms are applied in the output pixel format and colorspace.
+         the complementary TorchVision transform is such that a model should
+         not be able to tell the difference.
     """
 
     @abstractmethod
-    def _make_params(self) -> str:
+    def _make_transform_spec(self) -> str:
         pass
 
 
+def import_torchvision_transforms_v2() -> ModuleType:
+    try:
+        from torchvision.transforms import v2
+    except ImportError as e:
+        raise RuntimeError(
+            "Cannot import TorchVision; this should never happen, please report a bug."
+        ) from e
+    return v2
+
+
 @dataclass
 class Resize(DecoderTransform):
     """Resize the decoded frame to a given size.
 
-    Complementary TorchVision transform:
-    `torchvision.transforms.v2.Resize <https://docs.pytorch.org/vision/stable/generated/torchvision.transforms.v2.Resize.html#torchvision.transforms.v2.Resize>`_.
+    Complementary TorchVision transform: :class:`~torchvision.transforms.v2.Resize`.
     Interpolation is always bilinear. Anti-aliasing is always on.
 
     Args:
@@ -57,13 +65,13 @@ class Resize(DecoderTransform):
 
     size: Sequence[int]
 
-    def _make_params(self) -> str:
+    def _make_transform_spec(self) -> str:
         assert len(self.size) == 2
         return f"resize, {self.size[0]}, {self.size[1]}"
 
     @classmethod
     def _from_torchvision(cls, resize_tv: nn.Module):
-        from torchvision.transforms import v2
+        v2 = import_torchvision_transforms_v2()
 
         assert isinstance(resize_tv, v2.Resize)