WIP

NicolasHug · NicolasHug · commit 0256e18abfa8 · 2024-11-04T16:49:29.000Z
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -191,14 +191,13 @@ VideoDecoder::BatchDecodedOutput::BatchDecodedOutput(
     int64_t numFrames,
     const VideoStreamDecoderOptions& options,
     const StreamMetadata& metadata)
-    : frames(torch::empty(
-          {numFrames,
-           options.height.value_or(*metadata.height),
-           options.width.value_or(*metadata.width),
-           3},
-          at::TensorOptions(options.device).dtype(torch::kUInt8))),
-      ptsSeconds(torch::empty({numFrames}, {torch::kFloat64})),
-      durationSeconds(torch::empty({numFrames}, {torch::kFloat64})) {}
+    : ptsSeconds(torch::empty({numFrames}, {torch::kFloat64})),
+      durationSeconds(torch::empty({numFrames}, {torch::kFloat64})) {
+  int height, width;
+  std::tie(height, width) =
+      getHeightAndWidthFromOptionsOrMetadata(options, metadata);
+  frames = allocateEmptyHWCTensor(height, width, options.device, numFrames);
+}
 
 VideoDecoder::VideoDecoder() {}
 
@@ -893,8 +892,9 @@ void VideoDecoder::convertAVFrameToDecodedOutputOnCPU(
   torch::Tensor tensor;
   if (output.streamType == AVMEDIA_TYPE_VIDEO) {
     if (streamInfo.colorConversionLibrary == ColorConversionLibrary::SWSCALE) {
-      int width = streamInfo.options.width.value_or(frame->width);
-      int height = streamInfo.options.height.value_or(frame->height);
+      int height, width;
+      std::tie(height, width) =
+          getHeightAndWidthFromOptionsOrAVFrame(streamInfo.options, frame);
       if (preAllocatedOutputTensor.has_value()) {
         tensor = preAllocatedOutputTensor.value();
         auto shape = tensor.sizes();
@@ -908,8 +908,8 @@ void VideoDecoder::convertAVFrameToDecodedOutputOnCPU(
             "x3, got ",
             shape);
       } else {
-        tensor = torch::empty(
-            {height, width, 3}, torch::TensorOptions().dtype({torch::kUInt8}));
+        tensor = allocateEmptyHWCTensor(
+            height, width, streamInfo.options.device.type());
       }
       rawOutput.data = tensor.data_ptr<uint8_t>();
       convertFrameToBufferUsingSwsScale(rawOutput);
@@ -1400,6 +1400,63 @@ VideoDecoder::~VideoDecoder() {
   }
 }
 
+// Returns the (height, width) to use for decoded frames of a stream: a
+// dimension set in `options` takes precedence, otherwise the one recorded in
+// `metadata` is used.
+//
+// The metadata field is read only when the matching option is unset. Passing
+// `*metadata.height` as a value_or() argument would dereference it
+// unconditionally, which is undefined behavior on an empty optional even when
+// the fallback is never needed. value() is used for the fallback so that a
+// dimension missing from both sources throws std::bad_optional_access.
+// NOTE(review): assumes StreamMetadata::height/width are std::optional-like
+// (they expose value()) -- confirm against the struct definition.
+std::tuple<int, int> getHeightAndWidthFromOptionsOrMetadata(
+    const VideoDecoder::VideoStreamDecoderOptions& options,
+    const VideoDecoder::StreamMetadata& metadata) {
+  const int height = options.height.has_value()
+      ? static_cast<int>(*options.height)
+      : static_cast<int>(metadata.height.value());
+  const int width = options.width.has_value()
+      ? static_cast<int>(*options.width)
+      : static_cast<int>(metadata.width.value());
+  return std::make_tuple(height, width);
+}
+
+// Returns the (height, width) to use for a decoded frame: a dimension set in
+// `options` takes precedence, otherwise the dimension of `avFrame` is used.
+// `avFrame` is dereferenced unconditionally, so it must not be null.
+std::tuple<int, int> getHeightAndWidthFromOptionsOrAVFrame(
+    const VideoDecoder::VideoStreamDecoderOptions& options,
+    AVFrame* avFrame) {
+  return std::make_tuple(
+      options.height.value_or(avFrame->height),
+      options.width.value_or(avFrame->width));
+}
+
+// Allocates an uninitialized uint8 tensor on `device` for frames in
+// height x width x channel (HWC) layout with 3 channels. The shape is
+// (height, width, 3), or (numFrames, height, width, 3) when `numFrames` is
+// provided.
+// NOTE(review): `numFrames` is an optional<int> while
+// BatchDecodedOutput forwards an int64_t frame count, so that call narrows
+// implicitly -- confirm the count always fits in an int.
+torch::Tensor allocateEmptyHWCTensor(
+    int height,
+    int width,
+    torch::Device device,
+    std::optional<int> numFrames) {
+  auto tensorOptions = torch::TensorOptions()
+                           .dtype(torch::kUInt8)
+                           .layout(torch::kStrided)
+                           .device(device);
+  if (numFrames.has_value()) {
+    return torch::empty({numFrames.value(), height, width, 3}, tensorOptions);
+  } else {
+    return torch::empty({height, width, 3}, tensorOptions);
+  }
+}
+
 std::ostream& operator<<(
     std::ostream& os,
     const VideoDecoder::DecodeStats& stats) {
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h
@@ -243,6 +243,7 @@ class VideoDecoder {
         const VideoStreamDecoderOptions& options,
         const StreamMetadata& metadata);
   };
+
   // Returns frames at the given indices for a given stream as a single stacked
   // Tensor.
   BatchDecodedOutput getFramesAtIndices(
@@ -413,6 +414,28 @@ class VideoDecoder {
   bool scanned_all_streams_ = false;
 };
 
+// Returns the (height, width) to use for decoded frames of a stream. A
+// dimension set in `options` takes precedence; otherwise the one recorded in
+// `metadata` is used.
+std::tuple<int, int> getHeightAndWidthFromOptionsOrMetadata(
+    const VideoDecoder::VideoStreamDecoderOptions& options,
+    const VideoDecoder::StreamMetadata& metadata);
+
+// Returns the (height, width) to use for a decoded frame. A dimension set in
+// `options` takes precedence; otherwise the dimension of `avFrame` is used.
+std::tuple<int, int> getHeightAndWidthFromOptionsOrAVFrame(
+    const VideoDecoder::VideoStreamDecoderOptions& options,
+    AVFrame* avFrame);
+
+// Allocates an uninitialized uint8 tensor on `device` shaped
+// (height, width, 3), or (numFrames, height, width, 3) when `numFrames` is
+// provided.
+torch::Tensor allocateEmptyHWCTensor(
+    int height,
+    int width,
+    torch::Device device,
+    std::optional<int> numFrames = std::nullopt);
+
 // Prints the VideoDecoder::DecodeStats to the ostream.
 std::ostream& operator<<(
     std::ostream& os,