feedback

Dan-Flores · Dan-Flores · commit f8b64ea58297 · 2025-12-01T22:17:15.000Z
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -523,9 +523,7 @@ void AudioEncoder::flushBuffers() {
 
 namespace {
 
-torch::Tensor validateFrames(
-    const torch::Tensor& frames,
-    const torch::Device& device) {
+torch::Tensor validateFrames(const torch::Tensor& frames) {
   TORCH_CHECK(
       frames.dtype() == torch::kUInt8,
       "frames must have uint8 dtype, got ",
@@ -538,15 +536,6 @@ torch::Tensor validateFrames(
       frames.sizes()[1] == 3,
       "frame must have 3 channels (R, G, B), got ",
       frames.sizes()[1]);
-  if (device.type() != torch::kCPU) {
-    TORCH_CHECK(
-        frames.is_cuda(),
-        "When using CUDA encoding (device=",
-        device.str(),
-        "), frames must be on a CUDA device. Got frames on ",
-        frames.device().str(),
-        ". Please move frames to a CUDA device: frames.to('cuda')");
-  }
   return frames.contiguous();
 }
 
@@ -676,8 +665,7 @@ VideoEncoder::VideoEncoder(
     double frameRate,
     std::string_view fileName,
     const VideoStreamOptions& videoStreamOptions)
-    : frames_(validateFrames(frames, videoStreamOptions.device)),
-      inFrameRate_(frameRate) {
+    : frames_(validateFrames(frames)), inFrameRate_(frameRate) {
   setFFmpegLogLevel();
 
   // Allocate output format context
@@ -710,7 +698,7 @@ VideoEncoder::VideoEncoder(
     std::string_view formatName,
     std::unique_ptr<AVIOContextHolder> avioContextHolder,
     const VideoStreamOptions& videoStreamOptions)
-    : frames_(validateFrames(frames, videoStreamOptions.device)),
+    : frames_(validateFrames(frames)),
       inFrameRate_(frameRate),
       avioContextHolder_(std::move(avioContextHolder)) {
   setFFmpegLogLevel();
@@ -736,8 +724,8 @@ VideoEncoder::VideoEncoder(
 
 void VideoEncoder::initializeEncoder(
     const VideoStreamOptions& videoStreamOptions) {
-  if (videoStreamOptions.device.is_cuda()) {
-    gpuEncoder_ = std::make_unique<GpuEncoder>(videoStreamOptions.device);
+  if (frames_.device().is_cuda()) {
+    gpuEncoder_ = std::make_unique<GpuEncoder>(frames_.device());
   }
 
   const AVCodec* avCodec = nullptr;
@@ -764,12 +752,7 @@ void VideoEncoder::initializeEncoder(
     TORCH_CHECK(
         avFormatContext_->oformat != nullptr,
         "Output format is null, unable to find default codec.");
-    // Try to find a hardware-accelerated encoder if not using CPU
     avCodec = avcodec_find_encoder(avFormatContext_->oformat->video_codec);
-    if (gpuEncoder_) {
-      avCodec = gpuEncoder_->findEncoder(avFormatContext_->oformat->video_codec)
-                    .value_or(avCodec);
-    }
     TORCH_CHECK(avCodec != nullptr, "Video codec not found");
   }
 
@@ -842,11 +825,9 @@ void VideoEncoder::initializeEncoder(
         0);
   }
 
-  // Register the hardware device context with the codec
-  // context before calling avcodec_open2().
   if (gpuEncoder_) {
     gpuEncoder_->registerHardwareDeviceWithCodec(avCodecContext_.get());
-    gpuEncoder_->setupEncodingContext(avCodecContext_.get());
+    gpuEncoder_->setupHardwareFrameContext(avCodecContext_.get());
   }
 
   int status = avcodec_open2(avCodecContext_.get(), avCodec, &avCodecOptions);
diff --git a/src/torchcodec/_core/GpuEncoder.cpp b/src/torchcodec/_core/GpuEncoder.cpp
@@ -100,46 +100,32 @@ void GpuEncoder::initializeHardwareContext() {
   nppCtx_ = getNppStreamContext(device_);
 }
 
-std::optional<const AVCodec*> GpuEncoder::findEncoder(
-    const AVCodecID& codecId) {
-  void* i = nullptr;
-  const AVCodec* codec = nullptr;
-  while ((codec = av_codec_iterate(&i)) != nullptr) {
-    if (codec->id != codecId || !av_codec_is_encoder(codec)) {
-      continue;
-    }
-
-    const AVCodecHWConfig* config = nullptr;
-    for (int j = 0; (config = avcodec_get_hw_config(codec, j)) != nullptr;
-         ++j) {
-      if (config->device_type == AV_HWDEVICE_TYPE_CUDA) {
-        return codec;
-      }
-    }
-  }
-  return std::nullopt;
-}
-
 void GpuEncoder::registerHardwareDeviceWithCodec(AVCodecContext* codecContext) {
   TORCH_CHECK(
       hardwareDeviceCtx_, "Hardware device context has not been initialized");
   TORCH_CHECK(codecContext != nullptr, "codecContext is null");
   codecContext->hw_device_ctx = av_buffer_ref(hardwareDeviceCtx_.get());
 }
 
-void GpuEncoder::setupEncodingContext(AVCodecContext* codecContext) {
+// Allocates and initializes AVHWFramesContext, and sets pixel format fields
+// to enable encoding with CUDA device. The hw_frames_ctx field is needed by
+// FFmpeg to allocate frames on GPU's memory.
+void GpuEncoder::setupHardwareFrameContext(AVCodecContext* codecContext) {
   TORCH_CHECK(
       hardwareDeviceCtx_, "Hardware device context has not been initialized");
   TORCH_CHECK(codecContext != nullptr, "codecContext is null");
 
-  codecContext->sw_pix_fmt = AV_PIX_FMT_NV12;
-  codecContext->pix_fmt = AV_PIX_FMT_CUDA;
-
   AVBufferRef* hwFramesCtxRef = av_hwframe_ctx_alloc(hardwareDeviceCtx_.get());
   TORCH_CHECK(
       hwFramesCtxRef != nullptr,
       "Failed to allocate hardware frames context for codec");
 
+  // Always set pixel formats to options that support CUDA encoding.
+  // TODO-VideoEncoder: Allow user-specified pixel formats and convert them
+  // properly with the NPP functions below.
+  codecContext->sw_pix_fmt = AV_PIX_FMT_NV12;
+  codecContext->pix_fmt = AV_PIX_FMT_CUDA;
+
   AVHWFramesContext* hwFramesCtx =
       reinterpret_cast<AVHWFramesContext*>(hwFramesCtxRef->data);
   hwFramesCtx->format = codecContext->pix_fmt;
@@ -164,41 +150,44 @@ UniqueAVFrame GpuEncoder::convertTensorToAVFrame(
     [[maybe_unused]] AVPixelFormat targetFormat,
     int frameIndex,
     AVCodecContext* codecContext) {
-  TORCH_CHECK(tensor.is_cuda(), "GpuEncoder requires CUDA tensors");
+  TORCH_CHECK(
+      tensor.is_cuda(),
+      "Frame tensor is not stored on GPU, but the GPU method convertTensorToAVFrame was called.");
   TORCH_CHECK(
       tensor.dim() == 3 && tensor.size(0) == 3,
       "Expected 3D RGB tensor (CHW format), got shape: ",
       tensor.sizes());
+
+  // TODO-VideoEncoder: Unify AVFrame creation with CPU version of this method
   UniqueAVFrame avFrame(av_frame_alloc());
   TORCH_CHECK(avFrame != nullptr, "Failed to allocate AVFrame");
+  int height = static_cast<int>(tensor.size(1));
+  int width = static_cast<int>(tensor.size(2));
 
   avFrame->format = AV_PIX_FMT_CUDA;
-  avFrame->width = static_cast<int>(tensor.size(2));
-  avFrame->height = static_cast<int>(tensor.size(1));
+  avFrame->height = height;
+  avFrame->width = width;
   avFrame->pts = frameIndex;
 
-  int ret = av_hwframe_get_buffer(
-      codecContext ? codecContext->hw_frames_ctx : nullptr, avFrame.get(), 0);
+  // FFmpeg's av_hwframe_get_buffer is used to allocate memory on CUDA device.
+  // TODO-VideoEncoder: Consider using pytorch to allocate CUDA memory for
+  // efficiency
+  int ret =
+      av_hwframe_get_buffer(codecContext->hw_frames_ctx, avFrame.get(), 0);
   TORCH_CHECK(
       ret >= 0,
       "Failed to allocate hardware frame: ",
       getFFMPEGErrorStringFromErrorCode(ret));
 
-  // Validate that avFrame was properly allocated with CUDA memory
   TORCH_CHECK(
       avFrame != nullptr && avFrame->data[0] != nullptr,
       "avFrame must be pre-allocated with CUDA memory");
 
-  // Convert CHW to HWC for NPP processing
-  int height = static_cast<int>(tensor.size(1));
-  int width = static_cast<int>(tensor.size(2));
   torch::Tensor hwcFrame = tensor.permute({1, 2, 0}).contiguous();
 
-  // Get current CUDA stream for NPP operations
   at::cuda::CUDAStream currentStream =
       at::cuda::getCurrentCUDAStream(device_.index());
 
-  // Setup NPP context with current stream
   nppCtx_->hStream = currentStream.stream();
   cudaError_t cudaErr =
       cudaStreamGetFlags(nppCtx_->hStream, &nppCtx_->nStreamFlags);
@@ -207,9 +196,7 @@ UniqueAVFrame GpuEncoder::convertTensorToAVFrame(
       "cudaStreamGetFlags failed: ",
       cudaGetErrorString(cudaErr));
 
-  // Always use FFmpeg's default behavior: BT.601 limited range
   NppiSize oSizeROI = {width, height};
-
   NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
       static_cast<const Npp8u*>(hwcFrame.data_ptr()),
       hwcFrame.stride(0) * hwcFrame.element_size(),
@@ -224,15 +211,8 @@ UniqueAVFrame GpuEncoder::convertTensorToAVFrame(
       "Failed to convert RGB to NV12: NPP error code ",
       status);
 
-  // Validate CUDA operations completed successfully
-  cudaError_t memCheck = cudaGetLastError();
-  TORCH_CHECK(
-      memCheck == cudaSuccess,
-      "CUDA error detected: ",
-      cudaGetErrorString(memCheck));
-
   // TODO-VideoEncoder: Enable configuration of color properties, similar to
-  // FFmpeg Set color properties to FFmpeg defaults
+  // FFmpeg. Below are the default color properties used by FFmpeg.
   avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601
   avFrame->color_range = AVCOL_RANGE_MPEG; // Limited range
 
diff --git a/src/torchcodec/_core/GpuEncoder.h b/src/torchcodec/_core/GpuEncoder.h
@@ -27,9 +27,8 @@ class GpuEncoder {
   explicit GpuEncoder(const torch::Device& device);
   ~GpuEncoder();
 
-  std::optional<const AVCodec*> findEncoder(const AVCodecID& codecId);
   void registerHardwareDeviceWithCodec(AVCodecContext* codecContext);
-  void setupEncodingContext(AVCodecContext* codecContext);
+  void setupHardwareFrameContext(AVCodecContext* codecContext);
 
   UniqueAVFrame convertTensorToAVFrame(
       const torch::Tensor& tensor,
diff --git a/src/torchcodec/_core/StreamOptions.h b/src/torchcodec/_core/StreamOptions.h
@@ -41,6 +41,8 @@ struct VideoStreamOptions {
       ColorConversionLibrary::FILTERGRAPH;
 
   // By default we use CPU for decoding for both C++ and python users.
+  // Note: For video encoding, device is determined by the location of the input
+  // frame tensor.
   torch::Device device = torch::kCPU;
   // Device variant (e.g., "ffmpeg", "beta", etc.)
   std::string_view deviceVariant = "ffmpeg";
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
@@ -647,7 +647,6 @@ void encode_video_to_file(
     std::optional<std::string_view> preset = std::nullopt,
     std::optional<std::vector<std::string>> extra_options = std::nullopt) {
   VideoStreamOptions videoStreamOptions;
-  videoStreamOptions.device = frames.device();
   videoStreamOptions.codec = std::move(codec);
   videoStreamOptions.pixelFormat = std::move(pixel_format);
   videoStreamOptions.crf = crf;
@@ -672,7 +671,6 @@ at::Tensor encode_video_to_tensor(
     std::optional<std::vector<std::string>> extra_options = std::nullopt) {
   auto avioContextHolder = std::make_unique<AVIOToTensorContext>();
   VideoStreamOptions videoStreamOptions;
-  videoStreamOptions.device = frames.device();
   videoStreamOptions.codec = std::move(codec);
   videoStreamOptions.pixelFormat = std::move(pixel_format);
   videoStreamOptions.crf = crf;
@@ -709,7 +707,6 @@ void _encode_video_to_file_like(
   std::unique_ptr<AVIOFileLikeContext> avioContextHolder(fileLikeContext);
 
   VideoStreamOptions videoStreamOptions;
-  videoStreamOptions.device = frames.device();
   videoStreamOptions.codec = std::move(codec);
   videoStreamOptions.pixelFormat = std::move(pixel_format);
   videoStreamOptions.crf = crf;
diff --git a/test/test_encoders.py b/test/test_encoders.py
@@ -827,12 +827,16 @@ def test_contiguity(self, method, tmp_path, device):
         )
 
         def encode_to_tensor(frames):
-            common_params = dict(crf=0, pixel_format="yuv444p")
+            common_params = dict(
+                crf=0,
+                pixel_format="yuv444p",
+                codec="h264_nvenc" if device != "cpu" else None,
+            )
             if method == "to_file":
                 dest = str(tmp_path / "output.mp4")
                 VideoEncoder(frames, frame_rate=30).to_file(dest=dest, **common_params)
                 with open(dest, "rb") as f:
-                    return torch.frombuffer(f.read(), dtype=torch.uint8).clone()
+                    return torch.frombuffer(f.read(), dtype=torch.uint8)
             elif method == "to_tensor":
                 return VideoEncoder(frames, frame_rate=30).to_tensor(
                     format="mp4", **common_params
@@ -1269,7 +1273,6 @@ def test_extra_options_utilized(self, tmp_path, profile, colorspace, color_range
 
     @pytest.mark.needs_cuda
     @pytest.mark.skipif(in_fbcode(), reason="ffmpeg CLI not available")
-    @pytest.mark.parametrize("pixel_format", ("nv12", "yuv420p"))
     @pytest.mark.parametrize(
         "format_codec",
         [
@@ -1280,12 +1283,12 @@ def test_extra_options_utilized(self, tmp_path, profile, colorspace, color_range
         ],
     )
     @pytest.mark.parametrize("method", ("to_file", "to_tensor", "to_file_like"))
-    def test_nvenc_against_ffmpeg_cli(
-        self, tmp_path, pixel_format, format_codec, method
-    ):
+    # TODO-VideoEncoder: Enable additional pixel formats ("yuv420p", "yuv444p")
+    def test_nvenc_against_ffmpeg_cli(self, tmp_path, format_codec, method):
         # Encode with FFmpeg CLI using nvenc codecs
         format, codec = format_codec
         device = "cuda"
+        pixel_format = "nv12"
         qp = 1  # Lossless (qp=0) is not supported on av1_nvenc, so we use 1
         source_frames = self.decode(TEST_SRC_2_720P.path).data.to(device)
 
@@ -1315,13 +1318,11 @@ def test_nvenc_against_ffmpeg_cli(
 
         ffmpeg_cmd.extend(["-pix_fmt", pixel_format])  # Output format
         if codec == "av1_nvenc":
-            ffmpeg_cmd.extend(
-                ["-rc", "constqp"]
-            )  # Set rate control mode for AV1        else:
+            ffmpeg_cmd.extend(["-rc", "constqp"])  # Set rate control mode for AV1
         ffmpeg_cmd.extend(["-qp", str(qp)])  # Use lossless qp for other codecs
         ffmpeg_cmd.extend([ffmpeg_encoded_path])
 
-        # Will this prevent CI from treating test as failed if NVENC is not available?
+        # TODO-VideoEncoder: Ensure CI does not skip this test, as we know NVENC is available.
         try:
             subprocess.run(ffmpeg_cmd, check=True, capture_output=True)
         except subprocess.CalledProcessError as e: