Merge branch 'main' of https://github.com/meta-pytorch/torchcodec into codec_options_encode_option

Dan-Flores · Dan-Flores · commit 5629e6e82aec · 2025-11-14T10:41:45.000-05:00
diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml
@@ -62,7 +62,7 @@ jobs:
         run: python -m pip install --upgrade pip
       - name: Install dependencies and FFmpeg
         run: |
-          python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
+          python -m pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
           conda install "ffmpeg=7.0.1" pkg-config pybind11 -c conda-forge
           ffmpeg -version
       - name: Build and install torchcodec
diff --git a/.github/workflows/linux_cuda_wheel.yaml b/.github/workflows/linux_cuda_wheel.yaml
@@ -95,12 +95,13 @@ jobs:
           # We install conda packages at the start because otherwise conda may have conflicts with dependencies.
           # Note: xorg-libxau was addded to fix a problem with ffmpeg 4. We should consider removing it.
           default-packages: "nvidia/label/cuda-${{ matrix.cuda-version }}.0::libnpp nvidia::cuda-nvrtc=${{ matrix.cuda-version }} nvidia::cuda-toolkit=${{ matrix.cuda-version }} nvidia::cuda-cudart=${{ matrix.cuda-version }} nvidia::cuda-driver-dev=${{ matrix.cuda-version }} conda-forge::ffmpeg=${{ matrix.ffmpeg-version-for-tests }} conda-forge::xorg-libxau"
-      - name: Check env
+      - name: Check env, set LD_LIBRARY_PATH
         run: |
           ${CONDA_RUN} env
           ${CONDA_RUN} conda info
           ${CONDA_RUN} nvidia-smi
           ${CONDA_RUN} conda list
+          echo LD_LIBRARY_PATH=$CONDA_PREFIX/lib:/usr/local/cuda/lib64/:${LD_LIBRARY_PATH} >> $GITHUB_ENV
       - name: Assert ffmpeg exists
         run: |
           ${CONDA_RUN} ffmpeg -buildconf
diff --git a/docs/source/api_ref_transforms.rst b/docs/source/api_ref_transforms.rst
@@ -0,0 +1,17 @@
+.. _transforms:
+
+=====================
+torchcodec.transforms
+=====================
+
+.. currentmodule:: torchcodec.transforms
+
+For a tutorial, see: TODO_DECODER_TRANSFORMS_TUTORIAL.
+
+.. autosummary::
+    :toctree: generated/
+    :nosignatures:
+    :template: dataclass.rst
+
+    DecoderTransform
+    Resize
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -209,6 +209,7 @@ def __call__(self, filename):
 intersphinx_mapping = {
     "python": ("https://docs.python.org/3/", None),
     "torch": ("https://pytorch.org/docs/stable/", None),
+    "torchvision": ("https://docs.pytorch.org/vision/stable/", None),
     "numpy": ("https://numpy.org/doc/stable/", None),
     "PIL": ("https://pillow.readthedocs.io/en/stable/", None),
     "matplotlib": ("https://matplotlib.org/stable/", None),
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -125,3 +125,4 @@ Encoding
    api_ref_decoders
    api_ref_encoders
    api_ref_samplers
+   api_ref_transforms
diff --git a/mypy.ini b/mypy.ini
@@ -4,3 +4,4 @@ files = src/torchcodec
 show_error_codes = True
 pretty = True
 allow_redefinition = True
+follow_untyped_imports = True
diff --git a/src/torchcodec/__init__.py b/src/torchcodec/__init__.py
@@ -9,7 +9,7 @@
 # Note: usort wants to put Frame and FrameBatch after decoders and samplers,
 # but that results in circular import.
 from ._frame import AudioSamples, Frame, FrameBatch  # usort:skip # noqa
-from . import decoders, encoders, samplers  # noqa
+from . import decoders, encoders, samplers, transforms  # noqa
 
 try:
     # Note that version.py is generated during install.
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -724,9 +724,33 @@ VideoEncoder::VideoEncoder(
 
 void VideoEncoder::initializeEncoder(
     const VideoStreamOptions& videoStreamOptions) {
-  const AVCodec* avCodec =
-      avcodec_find_encoder(avFormatContext_->oformat->video_codec);
-  TORCH_CHECK(avCodec != nullptr, "Video codec not found");
+  const AVCodec* avCodec = nullptr;
+  // If codec arg is provided, find codec using logic similar to FFmpeg:
+  // https://github.com/FFmpeg/FFmpeg/blob/master/fftools/ffmpeg_opt.c#L804-L835
+  if (videoStreamOptions.codec.has_value()) {
+    const std::string& codec = videoStreamOptions.codec.value();
+    // Try to find codec by name ("libx264", "libsvtav1")
+    avCodec = avcodec_find_encoder_by_name(codec.c_str());
+    // Try to find by codec descriptor ("h264", "av1")
+    if (!avCodec) {
+      const AVCodecDescriptor* desc =
+          avcodec_descriptor_get_by_name(codec.c_str());
+      if (desc) {
+        avCodec = avcodec_find_encoder(desc->id);
+      }
+    }
+    TORCH_CHECK(
+        avCodec != nullptr,
+        "Video codec ",
+        codec,
+        " not found. To see available codecs, run: ffmpeg -encoders");
+  } else {
+    TORCH_CHECK(
+        avFormatContext_->oformat != nullptr,
+        "Output format is null, unable to find default codec.");
+    avCodec = avcodec_find_encoder(avFormatContext_->oformat->video_codec);
+    TORCH_CHECK(avCodec != nullptr, "Video codec not found");
+  }
 
   AVCodecContext* avCodecContext = avcodec_alloc_context3(avCodec);
   TORCH_CHECK(avCodecContext != nullptr, "Couldn't allocate codec context.");
diff --git a/src/torchcodec/_core/StreamOptions.h b/src/torchcodec/_core/StreamOptions.h
@@ -46,6 +46,7 @@ struct VideoStreamOptions {
   std::string_view deviceVariant = "ffmpeg";
 
   // Encoding options
+  std::optional<std::string> codec;
   // Optional pixel format for video encoding (e.g., "yuv420p", "yuv444p")
   // If not specified, uses codec's default format.
   std::optional<std::string> pixelFormat;
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
@@ -37,11 +37,11 @@ TORCH_LIBRARY(torchcodec_ns, m) {
   m.def(
       "_encode_audio_to_file_like(Tensor samples, int sample_rate, str format, int file_like_context, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()");
   m.def(
-      "encode_video_to_file(Tensor frames, int frame_rate, str filename, str? pixel_format=None, float? crf=None, str? preset=None, str[]? codec_options=None) -> ()");
+      "encode_video_to_file(Tensor frames, int frame_rate, str filename, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? codec_options=None) -> ()");
   m.def(
-      "encode_video_to_tensor(Tensor frames, int frame_rate, str format, str? pixel_format=None, float? crf=None, str? preset=None, str[]? codec_options=None) -> Tensor");
+      "encode_video_to_tensor(Tensor frames, int frame_rate, str format, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? codec_options=None) -> Tensor");
   m.def(
-      "_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str? pixel_format=None, float? crf=None, str? preset=None, str[]? codec_options=None) -> ()");
+      "_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? codec_options=None) -> ()");
   m.def(
       "create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor");
   m.def(
@@ -613,11 +613,13 @@ void encode_video_to_file(
     const at::Tensor& frames,
     int64_t frame_rate,
     std::string_view file_name,
+    std::optional<std::string> codec = std::nullopt,
     std::optional<std::string_view> pixel_format = std::nullopt,
     std::optional<double> crf = std::nullopt,
     std::optional<std::string_view> preset = std::nullopt,
     std::optional<std::vector<std::string>> codec_options = std::nullopt) {
   VideoStreamOptions videoStreamOptions;
+  videoStreamOptions.codec = codec;
   videoStreamOptions.pixelFormat = pixel_format;
   videoStreamOptions.crf = crf;
   videoStreamOptions.preset = preset;
@@ -639,12 +641,14 @@ at::Tensor encode_video_to_tensor(
     const at::Tensor& frames,
     int64_t frame_rate,
     std::string_view format,
+    std::optional<std::string> codec = std::nullopt,
     std::optional<std::string_view> pixel_format = std::nullopt,
     std::optional<double> crf = std::nullopt,
     std::optional<std::string_view> preset = std::nullopt,
     std::optional<std::vector<std::string>> codec_options = std::nullopt) {
   auto avioContextHolder = std::make_unique<AVIOToTensorContext>();
   VideoStreamOptions videoStreamOptions;
+  videoStreamOptions.codec = codec;
   videoStreamOptions.pixelFormat = pixel_format;
   videoStreamOptions.crf = crf;
   videoStreamOptions.preset = preset;
@@ -668,6 +672,7 @@ void _encode_video_to_file_like(
     int64_t frame_rate,
     std::string_view format,
     int64_t file_like_context,
+    std::optional<std::string> codec = std::nullopt,
     std::optional<std::string_view> pixel_format = std::nullopt,
     std::optional<double> crf = std::nullopt,
     std::optional<std::string_view> preset = std::nullopt,
@@ -679,6 +684,7 @@ void _encode_video_to_file_like(
   std::unique_ptr<AVIOFileLikeContext> avioContextHolder(fileLikeContext);
 
   VideoStreamOptions videoStreamOptions;
+  videoStreamOptions.codec = codec;
   videoStreamOptions.pixelFormat = pixel_format;
   videoStreamOptions.crf = crf;
   videoStreamOptions.preset = preset;
diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py
@@ -213,8 +213,9 @@ def encode_video_to_file_like(
     frame_rate: int,
     format: str,
     file_like: Union[io.RawIOBase, io.BufferedIOBase],
-    crf: Optional[Union[int, float]] = None,
+    codec: Optional[str] = None,
     pixel_format: Optional[str] = None,
+    crf: Optional[Union[int, float]] = None,
     preset: Optional[str] = None,
     codec_options: Optional[list[str]] = None,
 ) -> None:
@@ -225,8 +226,9 @@ def encode_video_to_file_like(
         frame_rate: Frame rate in frames per second
         format: Video format (e.g., "mp4", "mov", "mkv")
         file_like: File-like object that supports write() and seek() methods
-        crf: Optional constant rate factor for encoding quality
+        codec: Optional codec name (e.g., "libx264", "h264")
         pixel_format: Optional pixel format (e.g., "yuv420p", "yuv444p")
+        crf: Optional constant rate factor for encoding quality
         preset: Optional encoder preset as string (e.g., "ultrafast", "medium")
         codec_options: Optional list of codec options as flattened key-value pairs
     """
@@ -237,6 +239,7 @@ def encode_video_to_file_like(
         frame_rate,
         format,
         _pybind_ops.create_file_like_context(file_like, True),  # True means for writing
+        codec,
         pixel_format,
         crf,
         preset,
@@ -328,6 +331,7 @@ def encode_video_to_file_abstract(
     frames: torch.Tensor,
     frame_rate: int,
     filename: str,
+    codec: Optional[str],
     pixel_format: Optional[str] = None,
     preset: Optional[str] = None,
     crf: Optional[Union[int, float]] = None,
@@ -341,6 +345,7 @@ def encode_video_to_tensor_abstract(
     frames: torch.Tensor,
     frame_rate: int,
     format: str,
+    codec: Optional[str],
     pixel_format: Optional[str] = None,
     preset: Optional[str] = None,
     crf: Optional[Union[int, float]] = None,
@@ -355,6 +360,7 @@ def _encode_video_to_file_like_abstract(
     frame_rate: int,
     format: str,
     file_like_context: int,
+    codec: Optional[str],
     pixel_format: Optional[str] = None,
     preset: Optional[str] = None,
     crf: Optional[Union[int, float]] = None,
diff --git a/src/torchcodec/decoders/_video_decoder.py b/src/torchcodec/decoders/_video_decoder.py
@@ -8,17 +8,18 @@
 import json
 import numbers
 from pathlib import Path
-from typing import Literal, Optional, Tuple, Union
+from typing import List, Literal, Optional, Sequence, Tuple, Union
 
 import torch
-from torch import device as torch_device, Tensor
+from torch import device as torch_device, nn, Tensor
 
 from torchcodec import _core as core, Frame, FrameBatch
 from torchcodec.decoders._decoder_utils import (
     _get_cuda_backend,
     create_decoder,
     ERROR_REPORTING_INSTRUCTIONS,
 )
+from torchcodec.transforms import DecoderTransform, Resize
 
 
 class VideoDecoder:
@@ -67,6 +68,11 @@ class VideoDecoder:
             probably is. Default: "exact".
             Read more about this parameter in:
             :ref:`sphx_glr_generated_examples_decoding_approximate_mode.py`
+        transforms (sequence of transform objects, optional): Sequence of transforms to be
+            applied to the decoded frames by the decoder itself, in order. Accepts both
+            :class:`~torchcodec.transforms.DecoderTransform` and
+            :class:`~torchvision.transforms.v2.Transform`
+            objects. Read more about this parameter in: TODO_DECODER_TRANSFORMS_TUTORIAL.
         custom_frame_mappings (str, bytes, or file-like object, optional):
             Mapping of frames to their metadata, typically generated via ffprobe.
             This enables accurate frame seeking without requiring a full video scan.
@@ -105,6 +111,7 @@ def __init__(
         num_ffmpeg_threads: int = 1,
         device: Optional[Union[str, torch_device]] = None,
         seek_mode: Literal["exact", "approximate"] = "exact",
+        transforms: Optional[Sequence[Union[DecoderTransform, nn.Module]]] = None,
         custom_frame_mappings: Optional[
             Union[str, bytes, io.RawIOBase, io.BufferedReader]
         ] = None,
@@ -151,13 +158,16 @@ def __init__(
 
         device_variant = _get_cuda_backend()
 
+        transform_specs = _make_transform_specs(transforms)
+
         core.add_video_stream(
             self._decoder,
             stream_index=stream_index,
             dimension_order=dimension_order,
             num_threads=num_ffmpeg_threads,
             device=device,
             device_variant=device_variant,
+            transform_specs=transform_specs,
             custom_frame_mappings=custom_frame_mappings_data,
         )
 
@@ -435,6 +445,78 @@ def _get_and_validate_stream_metadata(
     )
 
 
+def _convert_to_decoder_transforms(
+    transforms: Sequence[Union[DecoderTransform, nn.Module]],
+) -> List[DecoderTransform]:
+    """Convert a sequence of transforms that may contain TorchVision transform
+    objects into a list of only TorchCodec transform objects.
+
+    Args:
+        transforms: Sequence of transform objects. The objects can be one of two
+        types:
+                1. torchcodec.transforms.DecoderTransform
+                2. torchvision.transforms.v2.Transform, but our type annotation
+                   only mentions its base, nn.Module. We don't want to take a
+                   hard dependency on TorchVision.
+
+    Returns:
+        List of DecoderTransform objects.
+    """
+    try:
+        from torchvision.transforms import v2
+
+        tv_available = True
+    except ImportError:
+        tv_available = False
+
+    converted_transforms: list[DecoderTransform] = []
+    for transform in transforms:
+        if not isinstance(transform, DecoderTransform):
+            if not tv_available:
+                raise ValueError(
+                    f"The supplied transform, {transform}, is not a TorchCodec "
+                    " DecoderTransform. TorchCodec also accept TorchVision "
+                    "v2 transforms, but TorchVision is not installed."
+                )
+            elif isinstance(transform, v2.Resize):
+                converted_transforms.append(Resize._from_torchvision(transform))
+            else:
+                raise ValueError(
+                    f"Unsupported transform: {transform}. Transforms must be "
+                    "either a TorchCodec DecoderTransform or a TorchVision "
+                    "v2 transform."
+                )
+        else:
+            converted_transforms.append(transform)
+
+    return converted_transforms
+
+
+def _make_transform_specs(
+    transforms: Optional[Sequence[Union[DecoderTransform, nn.Module]]],
+) -> str:
+    """Given a sequence of transforms, turn those into the specification string
+       the core API expects.
+
+    Args:
+        transforms: Optional sequence of transform objects. The objects can be
+            one of two types:
+                1. torchcodec.transforms.DecoderTransform
+                2. torchvision.transforms.v2.Transform, but our type annotation
+                   only mentions its base, nn.Module. We don't want to take a
+                   hard dependency on TorchVision.
+
+    Returns:
+        String of transforms in the format the core API expects: transform
+        specifications separated by semicolons.
+    """
+    if transforms is None:
+        return ""
+
+    transforms = _convert_to_decoder_transforms(transforms)
+    return ";".join([t._make_transform_spec() for t in transforms])
+
+
 def _read_custom_frame_mappings(
     custom_frame_mappings: Union[str, bytes, io.RawIOBase, io.BufferedReader]
 ) -> tuple[Tensor, Tensor, Tensor]:
diff --git a/src/torchcodec/encoders/_video_encoder.py b/src/torchcodec/encoders/_video_encoder.py
diff --git a/src/torchcodec/transforms/__init__.py b/src/torchcodec/transforms/__init__.py
diff --git a/src/torchcodec/transforms/_decoder_transforms.py b/src/torchcodec/transforms/_decoder_transforms.py
diff --git a/test/test_encoders.py b/test/test_encoders.py
diff --git a/test/test_transform_ops.py b/test/test_transform_ops.py