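WIP: derive output frame height/width from decoder options or stream metadata instead of AVCodecContext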

NicolasHug · NicolasHug · commit 4a9c00cb87fe · 2024-11-05T11:06:58.000Z
diff --git a/src/torchcodec/decoders/_core/CPUOnlyDevice.cpp b/src/torchcodec/decoders/_core/CPUOnlyDevice.cpp
@@ -17,7 +17,7 @@ namespace facebook::torchcodec {
 void convertAVFrameToDecodedOutputOnCuda(
     const torch::Device& device,
     const VideoDecoder::VideoStreamDecoderOptions& options,
-    AVCodecContext* codecContext,
+    const VideoDecoder::StreamMetadata& metadata,
     VideoDecoder::RawDecodedOutput& rawOutput,
     VideoDecoder::DecodedOutput& output,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
diff --git a/src/torchcodec/decoders/_core/CudaDevice.cpp b/src/torchcodec/decoders/_core/CudaDevice.cpp
@@ -154,18 +154,6 @@ AVBufferRef* getCudaContext(const torch::Device& device) {
 #endif
 }
 
-torch::Tensor allocateDeviceTensor(
-    at::IntArrayRef shape,
-    torch::Device device,
-    const torch::Dtype dtype = torch::kUInt8) {
-  return torch::empty(
-      shape,
-      torch::TensorOptions()
-          .dtype(dtype)
-          .layout(torch::kStrided)
-          .device(device));
-}
-
 void throwErrorIfNonCudaDevice(const torch::Device& device) {
   TORCH_CHECK(
       device.type() != torch::kCPU,
@@ -199,7 +187,7 @@ void initializeContextOnCuda(
 void convertAVFrameToDecodedOutputOnCuda(
     const torch::Device& device,
     const VideoDecoder::VideoStreamDecoderOptions& options,
-    AVCodecContext* codecContext,
+    const VideoDecoder::StreamMetadata& metadata,
     VideoDecoder::RawDecodedOutput& rawOutput,
     VideoDecoder::DecodedOutput& output,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
@@ -209,8 +197,9 @@ void convertAVFrameToDecodedOutputOnCuda(
       src->format == AV_PIX_FMT_CUDA,
       "Expected format to be AV_PIX_FMT_CUDA, got " +
           std::string(av_get_pix_fmt_name((AVPixelFormat)src->format)));
-  int width = options.width.value_or(codecContext->width);
-  int height = options.height.value_or(codecContext->height);
+  int height = 0, width = 0;
+  std::tie(height, width) =
+      getHeightAndWidthFromOptionsOrMetadata(options, metadata);
   NppiSize oSizeROI = {width, height};
   Npp8u* input[2] = {src->data[0], src->data[1]};
   torch::Tensor& dst = output.frame;
@@ -227,7 +216,7 @@ void convertAVFrameToDecodedOutputOnCuda(
         "x3, got ",
         shape);
   } else {
-    dst = allocateDeviceTensor({height, width, 3}, options.device);
+    dst = allocateEmptyHWCTensor(height, width, options.device);
   }
 
   // Use the user-requested GPU for running the NPP kernel.
diff --git a/src/torchcodec/decoders/_core/DeviceInterface.h b/src/torchcodec/decoders/_core/DeviceInterface.h
@@ -35,7 +35,7 @@ void initializeContextOnCuda(
 void convertAVFrameToDecodedOutputOnCuda(
     const torch::Device& device,
     const VideoDecoder::VideoStreamDecoderOptions& options,
-    AVCodecContext* codecContext,
+    const VideoDecoder::StreamMetadata& metadata,
     VideoDecoder::RawDecodedOutput& rawOutput,
     VideoDecoder::DecodedOutput& output,
     std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -193,7 +193,7 @@ VideoDecoder::BatchDecodedOutput::BatchDecodedOutput(
     const StreamMetadata& metadata)
     : ptsSeconds(torch::empty({numFrames}, {torch::kFloat64})),
       durationSeconds(torch::empty({numFrames}, {torch::kFloat64})) {
-  int height, width;
+  int height = 0, width = 0;
   std::tie(height, width) =
       getHeightAndWidthFromOptionsOrMetadata(options, metadata);
   frames = allocateEmptyHWCTensor(height, width, options.device, numFrames);
@@ -359,12 +359,10 @@ void VideoDecoder::initializeFilterGraphForStream(
   inputs->pad_idx = 0;
   inputs->next = nullptr;
   char description[512];
-  int width = activeStream.codecContext->width;
-  int height = activeStream.codecContext->height;
-  if (options.height.has_value() && options.width.has_value()) {
-    width = *options.width;
-    height = *options.height;
-  }
+  int height = 0, width = 0;
+  std::tie(height, width) = getHeightAndWidthFromOptionsOrMetadata(
+      options, containerMetadata_.streams[streamIndex]);
+
   std::snprintf(
       description,
       sizeof(description),
@@ -862,7 +860,7 @@ VideoDecoder::DecodedOutput VideoDecoder::convertAVFrameToDecodedOutput(
     convertAVFrameToDecodedOutputOnCuda(
         streamInfo.options.device,
         streamInfo.options,
-        streamInfo.codecContext.get(),
+        containerMetadata_.streams[streamIndex],
         rawOutput,
         output,
         preAllocatedOutputTensor);
@@ -1309,8 +1307,9 @@ void VideoDecoder::convertFrameToBufferUsingSwsScale(
   enum AVPixelFormat frameFormat =
       static_cast<enum AVPixelFormat>(frame->format);
   StreamInfo& activeStream = streams_[streamIndex];
-  int outputWidth = activeStream.options.width.value_or(frame->width);
-  int outputHeight = activeStream.options.height.value_or(frame->height);
+  int outputHeight = 0, outputWidth = 0;
+  std::tie(outputHeight, outputWidth) =
+      getHeightAndWidthFromOptionsOrAVFrame(activeStream.options, frame);
   if (activeStream.swsContext.get() == nullptr) {
     SwsContext* swsContext = sws_getContext(
         frame->width,
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h
@@ -414,6 +414,10 @@ class VideoDecoder {
   bool scanned_all_streams_ = false;
 };
 
+// --------------------------------------------------------------------------
+// FRAME TENSOR ALLOCATION APIs
+// --------------------------------------------------------------------------
+
 std::tuple<int, int> getHeightAndWidthFromOptionsOrMetadata(
     const VideoDecoder::VideoStreamDecoderOptions& options,
     const VideoDecoder::StreamMetadata& metadata);