@@ -1088,32 +1088,10 @@ void SingleStreamDecoder::setCursor(int64_t pts) {
10881088 cursor_ = pts;
10891089}
10901090
1091- /*
1092- Videos have I frames and non-I frames (P and B frames). Non-I frames need data
1093- from the previous I frame to be decoded.
1094-
1095- Imagine the cursor is at a random frame with PTS=lastDecodedAvFramePts (x for
1096- brevity) and we wish to seek to a user-specified PTS=y.
1097-
1098- If y < x, we don't have a choice but to seek backwards to the highest I frame
1099- before y.
1100-
1101- If y > x, we have two choices:
1102-
1103- 1. We could keep decoding forward until we hit y. Illustrated below:
1104-
1105- I P P P I P P P I P P I P P I P
1106- x y
1107-
1108- 2. We could try to jump to an I frame between x and y (indicated by j below).
1109- And then start decoding until we encounter y. Illustrated below:
1110-
1111- I P P P I P P P I P P I P P I P
1112- x j y
1113-
1114- (2) is more efficient than (1) if there is an I frame between x and y.
1115- */
11161091bool SingleStreamDecoder::canWeAvoidSeeking () const {
1092+ // Returns true if we can avoid seeking in the AVFormatContext based on
1093+ // heuristics that rely on the target cursor_ and the last decoded frame.
1094+ // Seeking is expensive, so we try to avoid it when possible.
11171095 const StreamInfo& streamInfo = streamInfos_.at (activeStreamIndex_);
11181096 if (streamInfo.avMediaType == AVMEDIA_TYPE_AUDIO) {
11191097 // For audio, we only need to seek if a backwards seek was requested
@@ -1136,13 +1114,34 @@ bool SingleStreamDecoder::canWeAvoidSeeking() const {
11361114 // implement caching.
11371115 return false ;
11381116 }
1139- // We are seeking forwards.
1140- // We can only skip a seek if both lastDecodedAvFramePts and
1141- // cursor_ share the same keyframe.
1142- int lastDecodedAvFrameIndex = getKeyFrameIndexForPts (lastDecodedAvFramePts_);
1143- int targetKeyFrameIndex = getKeyFrameIndexForPts (cursor_);
1144- return lastDecodedAvFrameIndex >= 0 && targetKeyFrameIndex >= 0 &&
1145- lastDecodedAvFrameIndex == targetKeyFrameIndex;
1117+ // We are seeking forwards. We can skip a seek if both the last decoded frame
1118+ // and cursor_ share the same keyframe:
1119+ // Videos have I frames and non-I frames (P and B frames). Non-I frames need
1120+ // data from the previous I frame to be decoded.
1121+ //
1122+ // Imagine the cursor is at a random frame with PTS=lastDecodedAvFramePts (x
1123+ // for brevity) and we wish to seek to a user-specified PTS=y.
1124+ //
1125+ // If y < x, we don't have a choice but to seek backwards to the highest I
1126+ // frame before y.
1127+ //
1128+ // If y > x, we have two choices:
1129+ //
1130+ // 1. We could keep decoding forward until we hit y. Illustrated below:
1131+ //
1132+ // I P P P I P P P I P P I P
1133+ //   x                 y
1134+ //
1135+ // 2. We could try to jump to an I frame between x and y (indicated by j
1136+ // below), and then start decoding until we encounter y. Illustrated below:
1137+ //
1138+ // I P P P I P P P I P P I P
1139+ //   x             j   y
1140+ // (2) is only more efficient than (1) if there is an I frame between x and y.
1141+ int lastKeyFrame = getKeyFrameIdentifier (lastDecodedAvFramePts_);
1142+ int targetKeyFrame = getKeyFrameIdentifier (cursor_);
1143+ return lastKeyFrame >= 0 && targetKeyFrame >= 0 &&
1144+ lastKeyFrame == targetKeyFrame;
11461145}
11471146
11481147// This method looks at currentPts and desiredPts and seeks in the
@@ -1359,7 +1358,19 @@ torch::Tensor SingleStreamDecoder::maybePermuteHWC2CHW(
13591358// PTS <-> INDEX CONVERSIONS
13601359// --------------------------------------------------------------------------
13611360
1362- int SingleStreamDecoder::getKeyFrameIndexForPts (int64_t pts) const {
1361+ int SingleStreamDecoder::getKeyFrameIdentifier (int64_t pts) const {
1362+ // This function "identifies" a key frame for a given pts value.
1363+ // We use the term "identifier" rather than "index" because the nature of the
1364+ // index that is returned depends on various factors:
1365+ // - If seek_mode is exact, we return the index of the key frame in the
1366+ // scanned key-frame vector (streamInfo.keyFrames). So the returned value is
1367+ // in [0, num_key_frames).
1368+ // - If seek_mode is approximate, we use av_index_search_timestamp() which
1369+ // may return a value in [0, num_key_frames) like for mkv, but also a value
1370+ // in [0, num_frames) like for mp4. It really depends on the container.
1371+ //
1372+ // The range of the "identifier" doesn't matter that much; for now we only
1373+ // use it to uniquely identify a key frame in canWeAvoidSeeking().
13631374 const StreamInfo& streamInfo = streamInfos_.at (activeStreamIndex_);
13641375 if (streamInfo.keyFrames .empty ()) {
13651376 return av_index_search_timestamp (
0 commit comments