meta-pytorch
diff --git a/‎src/torchcodec/_core/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎src/torchcodec/_core/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/torchcodec/_core/Encoder.cpp‎
Lines changed: 47 additions & 9 deletions b/‎src/torchcodec/_core/Encoder.cpp‎
Lines changed: 47 additions & 9 deletions
diff --git a/‎src/torchcodec/_core/Metadata.cpp‎
Lines changed: 121 additions & 0 deletions b/‎src/torchcodec/_core/Metadata.cpp‎
Lines changed: 121 additions & 0 deletions
diff --git a/‎src/torchcodec/_core/Metadata.h‎
Lines changed: 9 additions & 0 deletions b/‎src/torchcodec/_core/Metadata.h‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎src/torchcodec/_core/SingleStreamDecoder.cpp‎
Lines changed: 17 additions & 48 deletions b/‎src/torchcodec/_core/SingleStreamDecoder.cpp‎
Lines changed: 17 additions & 48 deletions
diff --git a/‎src/torchcodec/_core/SingleStreamDecoder.h‎
Lines changed: 7 additions & 6 deletions b/‎src/torchcodec/_core/SingleStreamDecoder.h‎
Lines changed: 7 additions & 6 deletions
@@ -96,6 +96,7 @@ function(make_torchcodec_libraries
         Encoder.cpp
         ValidationUtils.cpp
         Transform.cpp
+        Metadata.cpp
     )
 
     if(ENABLE_CUDA)
 
@@ -4,6 +4,10 @@
 #include "Encoder.h"
 #include "torch/types.h"
 
+extern "C" {
+#include <libavutil/pixdesc.h>
+}
+
 namespace facebook::torchcodec {
 
 namespace {
@@ -534,6 +538,36 @@ torch::Tensor validateFrames(const torch::Tensor& frames) {
   return frames.contiguous();
 }
 
+// Resolves a user-supplied pixel format name to an AVPixelFormat and checks
+// that the given encoder lists it as supported.
+//
+// Returns the resolved AVPixelFormat when it appears in the list returned by
+// getSupportedPixelFormats(avCodec). Otherwise fails through TORCH_CHECK with
+// a message that distinguishes an unknown format name from a known but
+// unsupported one, followed by the formats the encoder does list.
+AVPixelFormat validatePixelFormat(
+    const AVCodec& avCodec,
+    const std::string& targetPixelFormat) {
+  const AVPixelFormat pixelFormat = av_get_pix_fmt(targetPixelFormat.c_str());
+
+  // Validate that the encoder supports this pixel format. The list is
+  // terminated by AV_PIX_FMT_NONE, so an unresolved name (AV_PIX_FMT_NONE)
+  // can never match an entry.
+  const AVPixelFormat* supportedFormats = getSupportedPixelFormats(avCodec);
+  if (supportedFormats != nullptr) {
+    for (int i = 0; supportedFormats[i] != AV_PIX_FMT_NONE; ++i) {
+      if (supportedFormats[i] == pixelFormat) {
+        return pixelFormat;
+      }
+    }
+  }
+
+  std::stringstream errorMsg;
+  // av_get_pix_fmt failed to find a pix_fmt
+  if (pixelFormat == AV_PIX_FMT_NONE) {
+    errorMsg << "Unknown pixel format: " << targetPixelFormat;
+  } else {
+    errorMsg << "Specified pixel format " << targetPixelFormat
+             << " is not supported by the " << avCodec.name << " encoder.";
+  }
+  // Build error message, similar to FFmpeg's error log
+  errorMsg << "\nSupported pixel formats for " << avCodec.name << ":";
+  // The list may be null (the lookup above already allows for that), so it
+  // must not be dereferenced unconditionally while building the message.
+  if (supportedFormats != nullptr) {
+    for (int i = 0; supportedFormats[i] != AV_PIX_FMT_NONE; ++i) {
+      // av_get_pix_fmt_name returns NULL for a format it does not know;
+      // streaming a null const char* is undefined behaviour.
+      const char* formatName = av_get_pix_fmt_name(supportedFormats[i]);
+      errorMsg << " " << (formatName != nullptr ? formatName : "unknown");
+    }
+  } else {
+    errorMsg << " (none reported)";
+  }
+  TORCH_CHECK(false, errorMsg.str());
+}
 } // namespace
 
 VideoEncoder::~VideoEncoder() {
@@ -635,15 +669,19 @@ void VideoEncoder::initializeEncoder(
   outWidth_ = inWidth_;
   outHeight_ = inHeight_;
 
-  // TODO-VideoEncoder: Enable other pixel formats
-  // Let FFmpeg choose best pixel format to minimize loss
-  outPixelFormat_ = avcodec_find_best_pix_fmt_of_list(
-      getSupportedPixelFormats(*avCodec), // List of supported formats
-      AV_PIX_FMT_GBRP, // We reorder input to GBRP currently
-      0, // No alpha channel
-      nullptr // Discard conversion loss information
-  );
-  TORCH_CHECK(outPixelFormat_ != -1, "Failed to find best pix fmt")
+  if (videoStreamOptions.pixelFormat.has_value()) {
+    outPixelFormat_ =
+        validatePixelFormat(*avCodec, videoStreamOptions.pixelFormat.value());
+  } else {
+    const AVPixelFormat* formats = getSupportedPixelFormats(*avCodec);
+    // Use first listed pixel format as default (often yuv420p).
+    // This is similar to FFmpeg's logic:
+    // https://www.ffmpeg.org/doxygen/4.0/decode_8c_source.html#l01087
+    // If pixel formats are undefined for some reason, try yuv420p
+    outPixelFormat_ = (formats && formats[0] != AV_PIX_FMT_NONE)
+        ? formats[0]
+        : AV_PIX_FMT_YUV420P;
+  }
 
   // Configure codec parameters
   avCodecContext_->codec_id = avCodec->id;
 
@@ -0,0 +1,121 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include "Metadata.h"
+#include "torch/types.h"
+
+namespace facebook::torchcodec {
+
+// Stream duration in seconds, resolved according to seekMode.
+//
+// exact / custom_frame_mappings: end PTS minus begin PTS, both taken from the
+// content scan; TORCH_CHECK fails if either is missing.
+// approximate: the header duration when present; otherwise
+// numFramesFromHeader / averageFpsFromHeader when both are present and the
+// fps is non-zero; otherwise std::nullopt.
+std::optional<double> StreamMetadata::getDurationSeconds(
+    SeekMode seekMode) const {
+  switch (seekMode) {
+    case SeekMode::custom_frame_mappings:
+    case SeekMode::exact: {
+      const bool hasBothBounds = endStreamPtsSecondsFromContent.has_value() &&
+          beginStreamPtsSecondsFromContent.has_value();
+      TORCH_CHECK(
+          hasBothBounds,
+          "Missing beginStreamPtsSecondsFromContent or endStreamPtsSecondsFromContent");
+      return *endStreamPtsSecondsFromContent -
+          *beginStreamPtsSecondsFromContent;
+    }
+    case SeekMode::approximate: {
+      if (durationSecondsFromHeader.has_value()) {
+        return *durationSecondsFromHeader;
+      }
+      // Derive the duration from frame count and fps; the zero check avoids
+      // a division by zero.
+      const bool canDerive = numFramesFromHeader.has_value() &&
+          averageFpsFromHeader.has_value() && *averageFpsFromHeader != 0.0;
+      if (!canDerive) {
+        return std::nullopt;
+      }
+      return static_cast<double>(*numFramesFromHeader) /
+          *averageFpsFromHeader;
+    }
+    default:
+      TORCH_CHECK(false, "Unknown SeekMode");
+  }
+}
+
+// Start time of the stream in seconds, resolved according to seekMode.
+//
+// exact / custom_frame_mappings: the begin PTS from the content scan;
+// TORCH_CHECK fails if it is missing.
+// approximate: the begin PTS from content when present, otherwise 0.0.
+double StreamMetadata::getBeginStreamSeconds(SeekMode seekMode) const {
+  switch (seekMode) {
+    case SeekMode::custom_frame_mappings:
+    case SeekMode::exact: {
+      const bool hasBegin = beginStreamPtsSecondsFromContent.has_value();
+      TORCH_CHECK(hasBegin, "Missing beginStreamPtsSecondsFromContent");
+      return *beginStreamPtsSecondsFromContent;
+    }
+    case SeekMode::approximate:
+      // Fall back to the start of the timeline when no content value exists.
+      return beginStreamPtsSecondsFromContent.value_or(0.0);
+    default:
+      TORCH_CHECK(false, "Unknown SeekMode");
+  }
+}
+
+// End time of the stream in seconds, resolved according to seekMode.
+//
+// exact / custom_frame_mappings: the end PTS from the content scan;
+// TORCH_CHECK fails if it is missing.
+// approximate: the end PTS from content when present, otherwise whatever
+// getDurationSeconds(seekMode) yields (possibly std::nullopt).
+std::optional<double> StreamMetadata::getEndStreamSeconds(
+    SeekMode seekMode) const {
+  switch (seekMode) {
+    case SeekMode::custom_frame_mappings:
+    case SeekMode::exact: {
+      const bool hasEnd = endStreamPtsSecondsFromContent.has_value();
+      TORCH_CHECK(hasEnd, "Missing endStreamPtsSecondsFromContent");
+      return *endStreamPtsSecondsFromContent;
+    }
+    case SeekMode::approximate: {
+      if (!endStreamPtsSecondsFromContent.has_value()) {
+        // NOTE(review): this fallback is a duration, which equals the end
+        // time only when the stream starts at 0 - confirm that is intended.
+        return getDurationSeconds(seekMode);
+      }
+      return *endStreamPtsSecondsFromContent;
+    }
+    default:
+      TORCH_CHECK(false, "Unknown SeekMode");
+  }
+}
+
+// Number of frames in the stream, resolved according to seekMode.
+//
+// exact / custom_frame_mappings: the frame count from the content scan;
+// TORCH_CHECK fails if it is missing.
+// approximate: the header frame count when present; otherwise
+// averageFpsFromHeader * durationSecondsFromHeader truncated to int64_t when
+// both are present; otherwise std::nullopt.
+std::optional<int64_t> StreamMetadata::getNumFrames(SeekMode seekMode) const {
+  switch (seekMode) {
+    case SeekMode::custom_frame_mappings:
+    case SeekMode::exact: {
+      const bool hasCount = numFramesFromContent.has_value();
+      TORCH_CHECK(hasCount, "Missing numFramesFromContent");
+      return *numFramesFromContent;
+    }
+    case SeekMode::approximate: {
+      if (numFramesFromHeader.has_value()) {
+        return *numFramesFromHeader;
+      }
+      const bool canEstimate = averageFpsFromHeader.has_value() &&
+          durationSecondsFromHeader.has_value();
+      if (!canEstimate) {
+        return std::nullopt;
+      }
+      // The cast truncates toward zero rather than rounding.
+      const double estimatedFrames =
+          *averageFpsFromHeader * *durationSecondsFromHeader;
+      return static_cast<int64_t>(estimatedFrames);
+    }
+    default:
+      TORCH_CHECK(false, "Unknown SeekMode");
+  }
+}
+
+// Average frames per second, resolved according to seekMode.
+//
+// exact / custom_frame_mappings: getNumFrames(seekMode) divided by the
+// content-derived span (end PTS minus begin PTS) when all three values are
+// present and the span is non-zero; otherwise averageFpsFromHeader.
+// approximate: averageFpsFromHeader as-is.
+std::optional<double> StreamMetadata::getAverageFps(SeekMode seekMode) const {
+  switch (seekMode) {
+    case SeekMode::custom_frame_mappings:
+    case SeekMode::exact: {
+      const std::optional<int64_t> frameCount = getNumFrames(seekMode);
+      const bool haveSpan = frameCount.has_value() &&
+          beginStreamPtsSecondsFromContent.has_value() &&
+          endStreamPtsSecondsFromContent.has_value();
+      if (haveSpan) {
+        const double spanSeconds = *endStreamPtsSecondsFromContent -
+            *beginStreamPtsSecondsFromContent;
+        // A zero-length span would divide by zero; use the header value.
+        if (spanSeconds != 0.0) {
+          return static_cast<double>(*frameCount) / spanSeconds;
+        }
+      }
+      return averageFpsFromHeader;
+    }
+    case SeekMode::approximate:
+      return averageFpsFromHeader;
+    default:
+      TORCH_CHECK(false, "Unknown SeekMode");
+  }
+}
+
+} // namespace facebook::torchcodec
@@ -18,6 +18,8 @@ extern "C" {
 
 namespace facebook::torchcodec {
 
+enum class SeekMode { exact, approximate, custom_frame_mappings };
+
 struct StreamMetadata {
   // Common (video and audio) fields derived from the AVStream.
   int streamIndex;
@@ -52,6 +54,13 @@ struct StreamMetadata {
   std::optional<int64_t> sampleRate;
   std::optional<int64_t> numChannels;
   std::optional<std::string> sampleFormat;
+
+  // Computed methods with fallback logic
+  std::optional<double> getDurationSeconds(SeekMode seekMode) const;
+  double getBeginStreamSeconds(SeekMode seekMode) const;
+  std::optional<double> getEndStreamSeconds(SeekMode seekMode) const;
+  std::optional<int64_t> getNumFrames(SeekMode seekMode) const;
+  std::optional<double> getAverageFps(SeekMode seekMode) const;
 };
 
 struct ContainerMetadata {
 
@@ -367,6 +367,14 @@ ContainerMetadata SingleStreamDecoder::getContainerMetadata() const {
   return containerMetadata_;
 }
 
+// Read-only accessor: returns the decoder's seekMode_ member by value.
+SeekMode SingleStreamDecoder::getSeekMode() const {
+  return seekMode_;
+}
+
+// Read-only accessor: returns the decoder's activeStreamIndex_ member by
+// value, without validating that a stream has been activated.
+int SingleStreamDecoder::getActiveStreamIndex() const {
+  return activeStreamIndex_;
+}
+
 torch::Tensor SingleStreamDecoder::getKeyFrameIndices() {
   validateActiveStream(AVMEDIA_TYPE_VIDEO);
   validateScannedAllStreams("getKeyFrameIndices");
@@ -611,7 +619,7 @@ FrameOutput SingleStreamDecoder::getFrameAtIndexInternal(
   const auto& streamMetadata =
       containerMetadata_.allStreamMetadata[activeStreamIndex_];
 
-  std::optional<int64_t> numFrames = getNumFrames(streamMetadata);
+  std::optional<int64_t> numFrames = streamMetadata.getNumFrames(seekMode_);
   if (numFrames.has_value()) {
     // If the frameIndex is negative, we convert it to a positive index
     frameIndex = frameIndex >= 0 ? frameIndex : frameIndex + numFrames.value();
@@ -705,7 +713,7 @@ FrameBatchOutput SingleStreamDecoder::getFramesInRange(
 
   // Note that if we do not have the number of frames available in our
   // metadata, then we assume that the upper part of the range is valid.
-  std::optional<int64_t> numFrames = getNumFrames(streamMetadata);
+  std::optional<int64_t> numFrames = streamMetadata.getNumFrames(seekMode_);
   if (numFrames.has_value()) {
     TORCH_CHECK(
         stop <= numFrames.value(),
@@ -779,8 +787,9 @@ FrameBatchOutput SingleStreamDecoder::getFramesPlayedAt(
   const auto& streamMetadata =
       containerMetadata_.allStreamMetadata[activeStreamIndex_];
 
-  double minSeconds = getMinSeconds(streamMetadata);
-  std::optional<double> maxSeconds = getMaxSeconds(streamMetadata);
+  double minSeconds = streamMetadata.getBeginStreamSeconds(seekMode_);
+  std::optional<double> maxSeconds =
+      streamMetadata.getEndStreamSeconds(seekMode_);
 
   // The frame played at timestamp t and the one played at timestamp `t +
   // eps` are probably the same frame, with the same index. The easiest way to
@@ -857,7 +866,7 @@ FrameBatchOutput SingleStreamDecoder::getFramesPlayedInRange(
     return frameBatchOutput;
   }
 
-  double minSeconds = getMinSeconds(streamMetadata);
+  double minSeconds = streamMetadata.getBeginStreamSeconds(seekMode_);
   TORCH_CHECK(
       startSeconds >= minSeconds,
       "Start seconds is " + std::to_string(startSeconds) +
@@ -866,7 +875,8 @@ FrameBatchOutput SingleStreamDecoder::getFramesPlayedInRange(
 
   // Note that if we can't determine the maximum seconds from the metadata,
   // then we assume upper range is valid.
-  std::optional<double> maxSeconds = getMaxSeconds(streamMetadata);
+  std::optional<double> maxSeconds =
+      streamMetadata.getEndStreamSeconds(seekMode_);
   if (maxSeconds.has_value()) {
     TORCH_CHECK(
         startSeconds < maxSeconds.value(),
@@ -1439,47 +1449,6 @@ int64_t SingleStreamDecoder::getPts(int64_t frameIndex) {
 // STREAM AND METADATA APIS
 // --------------------------------------------------------------------------
 
-std::optional<int64_t> SingleStreamDecoder::getNumFrames(
-    const StreamMetadata& streamMetadata) {
-  switch (seekMode_) {
-    case SeekMode::custom_frame_mappings:
-    case SeekMode::exact:
-      return streamMetadata.numFramesFromContent.value();
-    case SeekMode::approximate: {
-      return streamMetadata.numFramesFromHeader;
-    }
-    default:
-      TORCH_CHECK(false, "Unknown SeekMode");
-  }
-}
-
-double SingleStreamDecoder::getMinSeconds(
-    const StreamMetadata& streamMetadata) {
-  switch (seekMode_) {
-    case SeekMode::custom_frame_mappings:
-    case SeekMode::exact:
-      return streamMetadata.beginStreamPtsSecondsFromContent.value();
-    case SeekMode::approximate:
-      return 0;
-    default:
-      TORCH_CHECK(false, "Unknown SeekMode");
-  }
-}
-
-std::optional<double> SingleStreamDecoder::getMaxSeconds(
-    const StreamMetadata& streamMetadata) {
-  switch (seekMode_) {
-    case SeekMode::custom_frame_mappings:
-    case SeekMode::exact:
-      return streamMetadata.endStreamPtsSecondsFromContent.value();
-    case SeekMode::approximate: {
-      return streamMetadata.durationSecondsFromHeader;
-    }
-    default:
-      TORCH_CHECK(false, "Unknown SeekMode");
-  }
-}
-
 // --------------------------------------------------------------------------
 // VALIDATION UTILS
 // --------------------------------------------------------------------------
@@ -1529,7 +1498,7 @@ void SingleStreamDecoder::validateFrameIndex(
 
   // Note that if we do not have the number of frames available in our
   // metadata, then we assume that the frameIndex is valid.
-  std::optional<int64_t> numFrames = getNumFrames(streamMetadata);
+  std::optional<int64_t> numFrames = streamMetadata.getNumFrames(seekMode_);
   if (numFrames.has_value()) {
     if (frameIndex >= numFrames.value()) {
       throw std::out_of_range(
 
@@ -16,6 +16,7 @@
 #include "DeviceInterface.h"
 #include "FFMPEGCommon.h"
 #include "Frame.h"
+#include "Metadata.h"
 #include "StreamOptions.h"
 #include "Transform.h"
 
@@ -30,8 +31,6 @@ class SingleStreamDecoder {
   // CONSTRUCTION API
   // --------------------------------------------------------------------------
 
-  enum class SeekMode { exact, approximate, custom_frame_mappings };
-
   // Creates a SingleStreamDecoder from the video at videoFilePath.
   explicit SingleStreamDecoder(
       const std::string& videoFilePath,
@@ -60,6 +59,12 @@ class SingleStreamDecoder {
   // Returns the metadata for the container.
   ContainerMetadata getContainerMetadata() const;
 
+  // Returns the seek mode of this decoder.
+  SeekMode getSeekMode() const;
+
+  // Returns the active stream index. Returns -2 if no stream is active.
+  int getActiveStreamIndex() const;
+
   // Returns the key frame indices as a tensor. The tensor is 1D and contains
   // int64 values, where each value is the frame index for a key frame.
   torch::Tensor getKeyFrameIndices();
@@ -312,10 +317,6 @@ class SingleStreamDecoder {
   // index. Note that this index may be truncated for some files.
   int getBestStreamIndex(AVMediaType mediaType);
 
-  std::optional<int64_t> getNumFrames(const StreamMetadata& streamMetadata);
-  double getMinSeconds(const StreamMetadata& streamMetadata);
-  std::optional<double> getMaxSeconds(const StreamMetadata& streamMetadata);
-
   // --------------------------------------------------------------------------
   // VALIDATION UTILS
   // --------------------------------------------------------------------------
Original file line number	Diff line number	Diff line change
`@@ -96,6 +96,7 @@ function(make_torchcodec_libraries`
`96`	`96`	`Encoder.cpp`
`97`	`97`	`ValidationUtils.cpp`
`98`	`98`	`Transform.cpp`
	`99`	`+ Metadata.cpp`
`99`	`100`	`)`
`100`	`101`
`101`	`102`	`if(ENABLE_CUDA)`