Add codec selection option and lookup logic; add valid-codecs test

Dan-Flores · Dan-Flores · commit 976bd2c1342c · 2025-11-12T14:28:59.000-05:00
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -649,9 +649,30 @@ VideoEncoder::VideoEncoder(
 
 void VideoEncoder::initializeEncoder(
     const VideoStreamOptions& videoStreamOptions) {
-  const AVCodec* avCodec =
-      avcodec_find_encoder(avFormatContext_->oformat->video_codec);
-  TORCH_CHECK(avCodec != nullptr, "Video codec not found");
+  const AVCodec* avCodec = nullptr;
+  // If codec arg is provided, find codec using logic similar to FFmpeg:
+  // https://github.com/FFmpeg/FFmpeg/blob/master/fftools/ffmpeg_opt.c#L804-L835
+  if (videoStreamOptions.codec.has_value()) {
+    const std::string& codec = videoStreamOptions.codec.value();
+    // Try to find codec by name ("libx264", "libsvtav1")
+    avCodec = avcodec_find_encoder_by_name(codec.c_str());
+    // Try to find by codec descriptor ("h264", "av1")
+    if (!avCodec) {
+      const AVCodecDescriptor* desc =
+          avcodec_descriptor_get_by_name(codec.c_str());
+      if (desc) {
+        avCodec = avcodec_find_encoder(desc->id);
+      }
+    }
+    TORCH_CHECK(
+        avCodec != nullptr,
+        "Video codec ",
+        codec,
+        " not found. Provide a codec name ('libx264', 'libx265') or a codec descriptor ('h264', 'hevc'), or do not specify a codec to use the default codec.");
+  } else {
+    avCodec = avcodec_find_encoder(avFormatContext_->oformat->video_codec);
+    TORCH_CHECK(avCodec != nullptr, "Video codec not found");
+  }
 
   AVCodecContext* avCodecContext = avcodec_alloc_context3(avCodec);
   TORCH_CHECK(avCodecContext != nullptr, "Couldn't allocate codec context.");
diff --git a/src/torchcodec/_core/StreamOptions.h b/src/torchcodec/_core/StreamOptions.h
@@ -45,6 +45,7 @@ struct VideoStreamOptions {
   std::string_view deviceVariant = "ffmpeg";
 
   // Encoding options
+  std::optional<std::string> codec;
   // TODO-VideoEncoder: Consider adding other optional fields here
   // (bit rate, gop size, max b frames, preset)
   std::optional<int> crf;
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
@@ -37,11 +37,11 @@ TORCH_LIBRARY(torchcodec_ns, m) {
   m.def(
       "_encode_audio_to_file_like(Tensor samples, int sample_rate, str format, int file_like_context, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()");
   m.def(
-      "encode_video_to_file(Tensor frames, int frame_rate, str filename, str? pixel_format=None, int? crf=None) -> ()");
+      "encode_video_to_file(Tensor frames, int frame_rate, str filename, str? codec=None, str? pixel_format=None, int? crf=None) -> ()");
   m.def(
-      "encode_video_to_tensor(Tensor frames, int frame_rate, str format, str? pixel_format=None, int? crf=None) -> Tensor");
+      "encode_video_to_tensor(Tensor frames, int frame_rate, str format, str? codec=None, str? pixel_format=None, int? crf=None) -> Tensor");
   m.def(
-      "_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str? pixel_format=None, int? crf=None) -> ()");
+      "_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str? codec=None, str? pixel_format=None, int? crf=None) -> ()");
   m.def(
       "create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor");
   m.def(
@@ -603,9 +603,11 @@ void encode_video_to_file(
     const at::Tensor& frames,
     int64_t frame_rate,
     std::string_view file_name,
+    std::optional<std::string> codec = std::nullopt,
     std::optional<std::string> pixel_format = std::nullopt,
     std::optional<int64_t> crf = std::nullopt) {
   VideoStreamOptions videoStreamOptions;
+  videoStreamOptions.codec = codec;
   videoStreamOptions.pixelFormat = pixel_format;
   videoStreamOptions.crf = crf;
   VideoEncoder(
@@ -620,10 +622,12 @@ at::Tensor encode_video_to_tensor(
     const at::Tensor& frames,
     int64_t frame_rate,
     std::string_view format,
+    std::optional<std::string> codec = std::nullopt,
     std::optional<std::string> pixel_format = std::nullopt,
     std::optional<int64_t> crf = std::nullopt) {
   auto avioContextHolder = std::make_unique<AVIOToTensorContext>();
   VideoStreamOptions videoStreamOptions;
+  videoStreamOptions.codec = codec;
   videoStreamOptions.pixelFormat = pixel_format;
   videoStreamOptions.crf = crf;
   return VideoEncoder(
@@ -640,6 +644,7 @@ void _encode_video_to_file_like(
     int64_t frame_rate,
     std::string_view format,
     int64_t file_like_context,
+    std::optional<std::string> codec = std::nullopt,
     std::optional<std::string> pixel_format = std::nullopt,
     std::optional<int64_t> crf = std::nullopt) {
   auto fileLikeContext =
@@ -649,6 +654,7 @@ void _encode_video_to_file_like(
   std::unique_ptr<AVIOFileLikeContext> avioContextHolder(fileLikeContext);
 
   VideoStreamOptions videoStreamOptions;
+  videoStreamOptions.codec = codec;
   videoStreamOptions.pixelFormat = pixel_format;
   videoStreamOptions.crf = crf;
 
diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py
@@ -213,6 +213,7 @@ def encode_video_to_file_like(
     frame_rate: int,
     format: str,
     file_like: Union[io.RawIOBase, io.BufferedIOBase],
+    codec: Optional[str] = None,
     crf: Optional[int] = None,
     pixel_format: Optional[str] = None,
 ) -> None:
@@ -223,6 +224,7 @@ def encode_video_to_file_like(
         frame_rate: Frame rate in frames per second
         format: Video format (e.g., "mp4", "mov", "mkv")
         file_like: File-like object that supports write() and seek() methods
+        codec: Optional codec name (e.g., "libx264", "h264")
         crf: Optional constant rate factor for encoding quality
         pixel_format: Optional pixel format (e.g., "yuv420p", "yuv444p")
     """
@@ -233,6 +235,7 @@ def encode_video_to_file_like(
         frame_rate,
         format,
         _pybind_ops.create_file_like_context(file_like, True),  # True means for writing
+        codec,
         pixel_format,
         crf,
     )
@@ -322,8 +325,9 @@ def encode_video_to_file_abstract(
     frames: torch.Tensor,
     frame_rate: int,
     filename: str,
-    crf: Optional[int] = None,
+    codec: Optional[str] = None,  # same default as the op schema
     pixel_format: Optional[str] = None,
+    crf: Optional[int] = None,
 ) -> None:
     return
 
@@ -333,6 +337,7 @@ def encode_video_to_tensor_abstract(
     frames: torch.Tensor,
     frame_rate: int,
     format: str,
+    codec: Optional[str],
     crf: Optional[int] = None,
     pixel_format: Optional[str] = None,
 ) -> torch.Tensor:
@@ -345,8 +350,9 @@ def _encode_video_to_file_like_abstract(
     frame_rate: int,
     format: str,
     file_like_context: int,
-    crf: Optional[int] = None,
+    codec: Optional[str] = None,
     pixel_format: Optional[str] = None,
+    crf: Optional[int] = None,
 ) -> None:
     return
 
diff --git a/src/torchcodec/encoders/_video_encoder.py b/src/torchcodec/encoders/_video_encoder.py
@@ -36,6 +36,7 @@ def to_file(
         self,
         dest: Union[str, Path],
         *,
+        codec: Optional[str] = None,
         pixel_format: Optional[str] = None,
     ) -> None:
         """Encode frames into a file.
@@ -44,37 +45,46 @@ def to_file(
             dest (str or ``pathlib.Path``): The path to the output file, e.g.
                 ``video.mp4``. The extension of the file determines the video
                 container format.
+            codec (str, optional): The codec to use for encoding (e.g., "libx264",
+                "h264"). If not specified, the default codec
+                for the container format will be used.
             pixel_format (str, optional): The pixel format for encoding (e.g.,
                 "yuv420p", "yuv444p"). If not specified, uses codec's default format.
         """
         _core.encode_video_to_file(
             frames=self._frames,
             frame_rate=self._frame_rate,
             filename=str(dest),
+            codec=codec,
             pixel_format=pixel_format,
         )
 
     def to_tensor(
         self,
         format: str,
         *,
+        codec: Optional[str] = None,
         pixel_format: Optional[str] = None,
     ) -> Tensor:
         """Encode frames into raw bytes, as a 1D uint8 Tensor.
 
         Args:
             format (str): The container format of the encoded frames, e.g. "mp4", "mov",
-            "mkv", "avi", "webm", "flv", or "gif"
+                "mkv", "avi", "webm", "flv", etc.
+            codec (str, optional): The codec to use for encoding (e.g., "libx264",
+                "h264"). If not specified, the default codec
+                for the container format will be used.
             pixel_format (str, optional): The pixel format to encode frames into (e.g.,
                 "yuv420p", "yuv444p"). If not specified, uses codec's default format.
 
         Returns:
-            Tensor: The raw encoded bytes as 4D uint8 Tensor.
+            Tensor: The raw encoded bytes as 1D uint8 Tensor.
         """
         return _core.encode_video_to_tensor(
             frames=self._frames,
             frame_rate=self._frame_rate,
             format=format,
+            codec=codec,
             pixel_format=pixel_format,
         )
 
@@ -83,6 +93,7 @@ def to_file_like(
         file_like,
         format: str,
         *,
+        codec: Optional[str] = None,
         pixel_format: Optional[str] = None,
     ) -> None:
         """Encode frames into a file-like object.
@@ -94,7 +105,10 @@ def to_file_like(
                 ``write(data: bytes) -> int`` and ``seek(offset: int, whence:
                 int = 0) -> int``.
             format (str): The container format of the encoded frames, e.g. "mp4", "mov",
-                "mkv", "avi", "webm", "flv", or "gif".
+                "mkv", "avi", "webm", "flv", etc.
+            codec (str, optional): The codec to use for encoding (e.g., "libx264",
+                "h264"). If not specified, the default codec
+                for the container format will be used.
             pixel_format (str, optional): The pixel format for encoding (e.g.,
                 "yuv420p", "yuv444p"). If not specified, uses codec's default format.
         """
@@ -103,5 +117,6 @@ def to_file_like(
             frame_rate=self._frame_rate,
             format=format,
             file_like=file_like,
+            codec=codec,
             pixel_format=pixel_format,
         )
diff --git a/test/test_encoders.py b/test/test_encoders.py
@@ -605,6 +605,12 @@ def test_bad_input_parameterized(self, tmp_path, method):
             )
             getattr(encoder, method)(**valid_params)
 
+        with pytest.raises(
+            RuntimeError,
+            match=r"Video codec invalid_codec_name not found.",
+        ):
+            encoder.to_file(str(tmp_path / "output.mp4"), codec="invalid_codec_name")
+
     def test_bad_input(self, tmp_path):
         encoder = VideoEncoder(
             frames=torch.zeros((5, 3, 64, 64), dtype=torch.uint8),
@@ -629,6 +635,24 @@ def test_bad_input(self, tmp_path):
         ):
             encoder.to_tensor(format="bad_format")
 
+    @pytest.mark.parametrize("method", ["to_file", "to_tensor", "to_file_like"])
+    @pytest.mark.parametrize("codec", ["h264", "hevc", "av1", "libx264", None])
+    def test_codec_valid_values(self, method, codec, tmp_path):
+        if method == "to_file":  # build the arguments each method requires
+            valid_params = {"dest": str(tmp_path / "test.mp4")}
+        elif method == "to_tensor":
+            valid_params = {"format": "mp4"}
+        elif method == "to_file_like":
+            valid_params = dict(file_like=io.BytesIO(), format="mp4")
+        else:
+            raise ValueError(f"Unknown method: {method}")
+
+        encoder = VideoEncoder(
+            frames=torch.zeros((5, 3, 128, 128), dtype=torch.uint8),
+            frame_rate=30,
+        )
+        getattr(encoder, method)(**valid_params, codec=codec)  # smoke test: must not raise
+
     @pytest.mark.parametrize("method", ("to_file", "to_tensor", "to_file_like"))
     def test_pixel_format_errors(self, method, tmp_path):
         frames = torch.zeros((5, 3, 64, 64), dtype=torch.uint8)