2025-12-02 nightly release (2311422)

pytorchbot · pytorchbot · commit af16703b860d · 2025-12-02T11:37:15.000Z
diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -3,6 +3,7 @@ sphinx==5.0.0
 sphinx_design
 sphinx_copybutton
 sphinx-tabs
+sphinx-sitemap
 matplotlib
 torchvision
 ipython
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -55,6 +55,7 @@
     "sphinx_tabs.tabs",
     "sphinx_design",
     "sphinx_copybutton",
+    "sphinx_sitemap",
 ]
 
 
@@ -216,6 +217,15 @@ def __call__(self, filename):
     "matplotlib": ("https://matplotlib.org/stable/", None),
 }
 
+# sitemap config
+html_baseurl = "https://meta-pytorch.org/torchcodec/stable/"
+sitemap_locales = [None]
+sitemap_excludes = [
+    "search.html",
+    "genindex.html",
+]
+sitemap_url_scheme = "{link}"
+
 
 def inject_minigalleries(app, what, name, obj, options, lines):
     """Inject a minigallery into a docstring.
diff --git a/examples/decoding/approximate_mode.py b/examples/decoding/approximate_mode.py
@@ -66,7 +66,7 @@
 # Performance: ``VideoDecoder`` creation
 # --------------------------------------
 #
-# In terms of performance, the ``seek_mode`` parameter ultimately affects the
+# In terms of performance, the ``seek_mode`` parameter mainly affects the
 # **creation** of a :class:`~torchcodec.decoders.VideoDecoder` object. The
 # longer the video, the higher the performance gain.
 
@@ -104,7 +104,7 @@ def bench(f, average_over=50, warmup=2, **f_kwargs):
 # ---------------------------------------------
 #
 # Strictly speaking the ``seek_mode`` parameter only affects the performance of
-# the :class:`~torchcodec.decoders.VideoDecoder` creation. It does not have a
+# the :class:`~torchcodec.decoders.VideoDecoder` creation. It usually does not have a
 # direct effect on the performance of frame decoding or sampling.  **However**,
 # because frame decoding and sampling patterns typically involve the creation of
 # the :class:`~torchcodec.decoders.VideoDecoder` (one per video), ``seek_mode``
@@ -168,20 +168,21 @@ def sample_clips(seek_mode):
 # duration), and also builds an internal index of frames and key-frames. This
 # internal index is potentially more accurate than the one in the file's
 # headers, which leads to more accurate seeking behavior.
-# Without the scan, TorchCodec relies only on the metadata contained in the
-# file, which may not always be as accurate.
+# Without the scan (in approximate mode), TorchCodec relies only on the metadata
+# contained in the file, which may not always be as accurate. In some rare
+# cases, relying on this less accurate data may also lead to slower frame
+# decoding, because it can involve unnecessary seeks.
 #
 # Which mode should I use?
 # ------------------------
 #
 # The general rule of thumb is as follows:
 #
 # - If you really care about exactness of frame seeking, use "exact".
-# - If you can sacrifice exactness of seeking for speed, which is usually the
-#   case when doing clip sampling, use "approximate".
-# - If your videos don't have variable framerate and their metadata is correct,
-#   then "approximate" mode is a net win: it will be just as accurate as the
-#   "exact" mode while still being significantly faster.
+# - If your videos are short (less than a few minutes) then "exact" will usually
+#   be preferable, as the scan's fixed cost will be negligible.
+# - For long videos, if you can sacrifice exactness of seeking for speed, which
+#   is usually the case when doing clip sampling, consider using "approximate".
 
 # %%
 shutil.rmtree(temp_dir)
diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -1088,32 +1088,10 @@ void SingleStreamDecoder::setCursor(int64_t pts) {
   cursor_ = pts;
 }
 
-/*
-Videos have I frames and non-I frames (P and B frames). Non-I frames need data
-from the previous I frame to be decoded.
-
-Imagine the cursor is at a random frame with PTS=lastDecodedAvFramePts (x for
-brevity) and we wish to seek to a user-specified PTS=y.
-
-If y < x, we don't have a choice but to seek backwards to the highest I frame
-before y.
-
-If y > x, we have two choices:
-
-1. We could keep decoding forward until we hit y. Illustrated below:
-
-I    P     P    P    I    P    P    P    I    P    P    I    P    P    I    P
-                          x         y
-
-2. We could try to jump to an I frame between x and y (indicated by j below).
-And then start decoding until we encounter y. Illustrated below:
-
-I    P     P    P    I    P    P    P    I    P    P    I    P    P    I    P
-                          x              j         y
-
-(2) is more efficient than (1) if there is an I frame between x and y.
-*/
 bool SingleStreamDecoder::canWeAvoidSeeking() const {
+  // Returns true if we can avoid seeking in the AVFormatContext based on
+  // heuristics that rely on the target cursor_ and the last decoded frame.
+  // Seeking is expensive, so we try to avoid it when possible.
   const StreamInfo& streamInfo = streamInfos_.at(activeStreamIndex_);
   if (streamInfo.avMediaType == AVMEDIA_TYPE_AUDIO) {
     // For audio, we only need to seek if a backwards seek was requested
@@ -1136,13 +1114,34 @@ bool SingleStreamDecoder::canWeAvoidSeeking() const {
     // implement caching.
     return false;
   }
-  // We are seeking forwards.
-  // We can only skip a seek if both lastDecodedAvFramePts and
-  // cursor_ share the same keyframe.
-  int lastDecodedAvFrameIndex = getKeyFrameIndexForPts(lastDecodedAvFramePts_);
-  int targetKeyFrameIndex = getKeyFrameIndexForPts(cursor_);
-  return lastDecodedAvFrameIndex >= 0 && targetKeyFrameIndex >= 0 &&
-      lastDecodedAvFrameIndex == targetKeyFrameIndex;
+  // We are seeking forwards. We can skip a seek if both the last decoded frame
+  // and cursor_ share the same keyframe:
+  // Videos have I frames and non-I frames (P and B frames). Non-I frames need
+  // data from the previous I frame to be decoded.
+  //
+  // Imagine the cursor is at a random frame with PTS=lastDecodedAvFramePts (x
+  // for brevity) and we wish to seek to a user-specified PTS=y.
+  //
+  // If y < x, we don't have a choice but to seek backwards to the highest I
+  // frame before y.
+  //
+  // If y > x, we have two choices:
+  //
+  // 1. We could keep decoding forward until we hit y. Illustrated below:
+  //
+  // I    P     P    P    I    P    P    P    I    P    P    I    P
+  //                           x         y
+  //
+  // 2. We could try to jump to an I frame between x and y (indicated by j
+  // below). And then start decoding until we encounter y. Illustrated below:
+  //
+  // I    P     P    P    I    P    P    P    I    P    P    I    P
+  //                           x              j         y
+  // (2) is only more efficient than (1) if there is an I frame between x and y.
+  int lastKeyFrame = getKeyFrameIdentifier(lastDecodedAvFramePts_);
+  int targetKeyFrame = getKeyFrameIdentifier(cursor_);
+  return lastKeyFrame >= 0 && targetKeyFrame >= 0 &&
+      lastKeyFrame == targetKeyFrame;
 }
 
 // This method looks at currentPts and desiredPts and seeks in the
@@ -1359,7 +1358,19 @@ torch::Tensor SingleStreamDecoder::maybePermuteHWC2CHW(
 // PTS <-> INDEX CONVERSIONS
 // --------------------------------------------------------------------------
 
-int SingleStreamDecoder::getKeyFrameIndexForPts(int64_t pts) const {
+int SingleStreamDecoder::getKeyFrameIdentifier(int64_t pts) const {
+  // This function "identifies" a key frame for a given pts value.
+  // We use the term "identifier" rather than "index" because the nature of the
+  // index that is returned depends on various factors:
+  // - If seek_mode is exact, we return the index of the key frame in the
+  //   scanned key-frame vector (streamInfo.keyFrames). So the returned value is
+  //   in [0, num_key_frames).
+  // - If seek_mode is approximate, we use av_index_search_timestamp() which
+  //   may return a value in [0, num_key_frames) like for mkv, but also a value
+  //   in [0, num_frames) like for mp4. It really depends on the container.
+  //
+  // The range of the "identifier" doesn't matter that much: for now we only
+  // use it to uniquely identify a key frame in canWeAvoidSeeking().
   const StreamInfo& streamInfo = streamInfos_.at(activeStreamIndex_);
   if (streamInfo.keyFrames.empty()) {
     return av_index_search_timestamp(
diff --git a/src/torchcodec/_core/SingleStreamDecoder.h b/src/torchcodec/_core/SingleStreamDecoder.h
@@ -282,7 +282,7 @@ class SingleStreamDecoder {
   // PTS <-> INDEX CONVERSIONS
   // --------------------------------------------------------------------------
 
-  int getKeyFrameIndexForPts(int64_t pts) const;
+  int getKeyFrameIdentifier(int64_t pts) const;
 
   // Returns the key frame index of the presentation timestamp using our index.
   // We build this index by scanning the file in
diff --git a/test/conftest.py b/test/conftest.py
@@ -4,12 +4,17 @@
 import pytest
 import torch
 
+from .utils import in_fbcode
+
 
 def pytest_configure(config):
     # register an additional marker (see pytest_collection_modifyitems)
     config.addinivalue_line(
         "markers", "needs_cuda: mark for tests that rely on a CUDA device"
     )
+    config.addinivalue_line(
+        "markers", "needs_ffmpeg_cli: mark for tests that rely on ffmpeg"
+    )
 
 
 def pytest_collection_modifyitems(items):
@@ -28,6 +33,15 @@ def pytest_collection_modifyitems(items):
         # 'needs_cuda' mark, and the ones with device == 'cpu' won't have the
         # mark.
         needs_cuda = item.get_closest_marker("needs_cuda") is not None
+        needs_ffmpeg_cli = item.get_closest_marker("needs_ffmpeg_cli") is not None
+        has_skip_marker = item.get_closest_marker("skip") is not None
+        has_skipif_marker = item.get_closest_marker("skipif") is not None
+
+        if in_fbcode():
+            # fbcode doesn't like skipping tests, so instead we just don't collect the tests
+            # so that they don't even "exist", hence the continue statements.
+            if needs_ffmpeg_cli or has_skip_marker or has_skipif_marker:
+                continue
 
         if (
             needs_cuda
diff --git a/test/test_decoders.py b/test/test_decoders.py
@@ -29,6 +29,7 @@
     BT709_FULL_RANGE,
     cuda_version_used_for_building_torch,
     get_ffmpeg_major_version,
+    get_python_version,
     H264_10BITS,
     H265_10BITS,
     H265_VIDEO,
@@ -39,6 +40,7 @@
     NASA_AUDIO_MP3_44100,
     NASA_VIDEO,
     needs_cuda,
+    needs_ffmpeg_cli,
     psnr,
     SINE_MONO_S16,
     SINE_MONO_S32,
@@ -1146,6 +1148,10 @@ def test_get_key_frame_indices(self, device):
 
     # TODO investigate why this fails internally.
     @pytest.mark.skipif(in_fbcode(), reason="Compile test fails internally.")
+    @pytest.mark.skipif(
+        get_python_version() >= (3, 14),
+        reason="torch.compile is not supported on Python 3.14+",
+    )
     @pytest.mark.parametrize("device", all_supported_devices())
     def test_compile(self, device):
         decoder, device = make_video_decoder(NASA_VIDEO.path, device=device)
@@ -1311,10 +1317,7 @@ def setup_frame_mappings(tmp_path, file, stream_index):
             # Return the custom frame mappings as a JSON string
             return custom_frame_mappings
 
-    @pytest.mark.skipif(
-        in_fbcode(),
-        reason="ffprobe not available internally",
-    )
+    @needs_ffmpeg_cli
     @pytest.mark.parametrize("device", all_supported_devices())
     @pytest.mark.parametrize("stream_index", [0, 3])
     @pytest.mark.parametrize(
@@ -1361,10 +1364,7 @@ def test_custom_frame_mappings_json_and_bytes(
             ),
         )
 
-    @pytest.mark.skipif(
-        in_fbcode(),
-        reason="ffprobe not available internally",
-    )
+    @needs_ffmpeg_cli
     @pytest.mark.parametrize("device", all_supported_devices())
     @pytest.mark.parametrize(
         "custom_frame_mappings,expected_match",
diff --git a/test/test_encoders.py b/test/test_encoders.py
diff --git a/test/test_ops.py b/test/test_ops.py
diff --git a/test/utils.py b/test/utils.py