Skip to content

Commit ddd9b23

Browse files
author
pytorchbot
committed
2025-11-21 nightly release (45ab588)
1 parent 45907c3 commit ddd9b23

File tree

11 files changed

+420
-68
lines changed

11 files changed

+420
-68
lines changed

docs/source/api_ref_encoders.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,4 @@ For an audio decoder tutorial, see: :ref:`sphx_glr_generated_examples_encoding_a
1616
:template: class.rst
1717

1818
AudioEncoder
19+
VideoEncoder

docs/source/conf.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ def __call__(self, filename):
8787
assert "examples/encoding" in self.src_dir
8888
order = [
8989
"audio_encoding.py",
90+
"video_encoding.py",
9091
]
9192

9293
try:

docs/source/index.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,14 @@ Encoding
9898

9999
How to encode audio samples
100100

101+
.. grid-item-card:: :octicon:`file-code;1em`
102+
Video Encoding
103+
:img-top: _static/img/card-background.svg
104+
:link: generated_examples/encoding/video_encoding.html
105+
:link-type: url
106+
107+
How to encode video frames
108+
101109
.. toctree::
102110
:maxdepth: 1
103111
:caption: TorchCodec documentation
Lines changed: 282 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,282 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
"""
8+
=======================================
9+
Encoding video frames with VideoEncoder
10+
=======================================
11+
12+
In this example, we'll learn how to encode video frames to a file or to raw
13+
bytes using the :class:`~torchcodec.encoders.VideoEncoder` class.
14+
"""
15+
16+
# %%
17+
# First, we'll download a video and decode some frames to tensors.
18+
# These will be the input to the :class:`~torchcodec.encoders.VideoEncoder`. For more details on decoding,
19+
# see :ref:`sphx_glr_generated_examples_decoding_basic_example.py`.
20+
# Otherwise, skip ahead to :ref:`creating_encoder`.
21+
22+
import requests
23+
from torchcodec.decoders import VideoDecoder
24+
from IPython.display import Video
25+
26+
27+
def play_video(encoded_bytes):
28+
return Video(
29+
data=encoded_bytes.numpy().tobytes(),
30+
embed=True,
31+
width=640,
32+
height=360,
33+
mimetype="video/mp4",
34+
)
35+
36+
37+
# Video source: https://www.pexels.com/video/adorable-cats-on-the-lawn-4977395/
38+
# Author: Altaf Shah.
39+
url = "https://videos.pexels.com/video-files/4977395/4977395-hd_1920_1080_24fps.mp4"
40+
41+
response = requests.get(url, headers={"User-Agent": ""})
42+
if response.status_code != 200:
43+
raise RuntimeError(f"Failed to download video. {response.status_code = }.")
44+
45+
raw_video_bytes = response.content
46+
47+
decoder = VideoDecoder(raw_video_bytes)
48+
frames = decoder.get_frames_in_range(0, 60).data # Get first 60 frames
49+
frame_rate = decoder.metadata.average_fps
50+
51+
# %%
52+
# .. _creating_encoder:
53+
#
54+
# Creating an encoder
55+
# -------------------
56+
#
57+
# Let's instantiate a :class:`~torchcodec.encoders.VideoEncoder`. We will need to provide
58+
# the frames to be encoded as a 4D tensor of shape
59+
# ``(num_frames, num_channels, height, width)`` with values in the ``[0, 255]``
60+
# range and ``torch.uint8`` dtype. We will also need to provide the frame rate of the input
61+
# video.
62+
#
63+
# .. note::
64+
#
65+
# The ``frame_rate`` parameter corresponds to the frame rate of the
66+
# *input* video. It will also be used for the frame rate of the *output* encoded video.
67+
from torchcodec.encoders import VideoEncoder
68+
69+
print(f"{frames.shape = }, {frames.dtype = }")
70+
print(f"{frame_rate = } fps")
71+
72+
encoder = VideoEncoder(frames=frames, frame_rate=frame_rate)
73+
74+
# %%
75+
# Encoding to file, bytes, or file-like
76+
# -------------------------------------
77+
#
78+
# :class:`~torchcodec.encoders.VideoEncoder` supports encoding frames into a
79+
# file via the :meth:`~torchcodec.encoders.VideoEncoder.to_file` method, to
80+
# file-like objects via the :meth:`~torchcodec.encoders.VideoEncoder.to_file_like`
81+
# method, or to raw bytes via :meth:`~torchcodec.encoders.VideoEncoder.to_tensor`.
82+
# For now we will use :meth:`~torchcodec.encoders.VideoEncoder.to_tensor`, so we
83+
# can easily inspect and display the encoded video.
84+
85+
encoded_frames = encoder.to_tensor(format="mp4")
86+
play_video(encoded_frames)
87+
88+
# %%
89+
#
90+
# Now that we have encoded data, we can decode it back to verify the
91+
# round-trip encode/decode process works as expected:
92+
93+
decoder_verify = VideoDecoder(encoded_frames)
94+
decoded_frames = decoder_verify.get_frames_in_range(0, 60).data
95+
96+
print(f"Re-decoded video: {decoded_frames.shape = }")
97+
print(f"Original frames: {frames.shape = }")
98+
99+
# %%
100+
# .. _codec_selection:
101+
#
102+
# Codec Selection
103+
# ---------------
104+
#
105+
# By default, the codec used is selected automatically using the file extension provided
106+
# in the ``dest`` parameter for the :meth:`~torchcodec.encoders.VideoEncoder.to_file` method,
107+
# or using the ``format`` parameter for the
108+
# :meth:`~torchcodec.encoders.VideoEncoder.to_file_like` and
109+
# :meth:`~torchcodec.encoders.VideoEncoder.to_tensor` methods.
110+
#
111+
# For example, when encoding to MP4 format, the default codec is typically ``H.264``.
112+
#
113+
# To use a codec other than the default, use the ``codec`` parameter.
114+
# You can specify either a specific codec implementation (e.g., ``"libx264"``)
115+
# or a codec specification (e.g., ``"h264"``). Different codecs offer
116+
# different tradeoffs between quality, file size, and encoding speed.
117+
#
118+
# .. note::
119+
#
120+
# To see available encoders on your system, run ``ffmpeg -encoders``.
121+
#
122+
# Let's encode the same frames using different codecs:
123+
124+
import tempfile
125+
from pathlib import Path
126+
127+
# H.264 encoding
128+
h264_output = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
129+
encoder.to_file(h264_output, codec="libx264")
130+
131+
# H.265 encoding
132+
hevc_output = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
133+
encoder.to_file(hevc_output, codec="hevc")
134+
135+
# Now let's use ffprobe to verify the codec used in the output files
136+
import subprocess
137+
138+
for output, name in [(h264_output, "h264_output"), (hevc_output, "hevc_output")]:
139+
result = subprocess.run(
140+
[
141+
"ffprobe",
142+
"-v",
143+
"error",
144+
"-select_streams",
145+
"v:0",
146+
"-show_entries",
147+
"stream=codec_name",
148+
"-of",
149+
"default=noprint_wrappers=1:nokey=1",
150+
output,
151+
],
152+
capture_output=True,
153+
text=True,
154+
)
155+
print(f"Codec used in {name}: {result.stdout.strip()}")
156+
157+
158+
# %%
159+
# .. _pixel_format:
160+
#
161+
# Pixel Format
162+
# ------------
163+
#
164+
# The ``pixel_format`` parameter controls the color sampling (chroma subsampling)
165+
# of the output video. This affects both quality and file size.
166+
#
167+
# Common pixel formats:
168+
#
169+
# - ``"yuv420p"`` - 4:2:0 chroma subsampling (standard quality, smaller file size, widely compatible)
170+
# - ``"yuv444p"`` - 4:4:4 chroma subsampling (full chroma resolution, higher quality, larger file size)
171+
#
172+
# Most playback devices and platforms support ``yuv420p``, making it the most
173+
# common choice for video encoding.
174+
#
175+
# .. note::
176+
#
177+
# Pixel format support depends on the codec used. Use ``ffmpeg -h encoder=<codec_name>``
178+
# to check available options for your selected codec.
179+
180+
# Standard pixel format
181+
yuv420_encoded_frames = encoder.to_tensor(
182+
format="mp4", codec="libx264", pixel_format="yuv420p"
183+
)
184+
play_video(yuv420_encoded_frames)
185+
186+
# %%
187+
# .. _crf:
188+
#
189+
# CRF (Constant Rate Factor)
190+
# --------------------------
191+
#
192+
# The ``crf`` parameter controls video quality, where lower values produce higher quality output.
193+
#
194+
# For example, with the commonly used H.264 codec, ``libx264``:
195+
#
196+
# - Values range from 0 (lossless) to 51 (worst quality)
197+
# - Values 17 or 18 are considered visually lossless, and the default is 23.
198+
#
199+
# .. note::
200+
#
201+
# The range and interpretation of CRF values depend on the codec used, and
202+
# not all codecs support CRF. Use ``ffmpeg -h encoder=<codec_name>`` to
203+
# check available options for your selected codec.
204+
#
205+
206+
# High quality (low CRF)
207+
high_quality_output = encoder.to_tensor(format="mp4", codec="libx264", crf=0)
208+
play_video(high_quality_output)
209+
210+
# %%
211+
# Low quality (high CRF)
212+
low_quality_output = encoder.to_tensor(format="mp4", codec="libx264", crf=50)
213+
play_video(low_quality_output)
214+
215+
216+
# %%
217+
# .. _preset:
218+
#
219+
# Preset
220+
# ------
221+
#
222+
# The ``preset`` parameter controls the tradeoff between encoding speed and file compression.
223+
# Faster presets encode faster but produce larger files, while slower
224+
# presets take more time to encode but result in better compression.
225+
#
226+
# For example, with the commonly used H.264 codec, ``libx264`` presets include
227+
# ``"ultrafast"`` (fastest), ``"fast"``, ``"medium"`` (default), ``"slow"``, and
228+
# ``"veryslow"`` (slowest, best compression). See the
229+
# `H.264 Video Encoding Guide <https://trac.ffmpeg.org/wiki/Encode/H.264#a2.Chooseapresetandtune>`_
230+
# for additional details.
231+
#
232+
# .. note::
233+
#
234+
# Not all codecs support the ``preset`` option. Use ``ffmpeg -h encoder=<codec_name>``
235+
# to check available options for your selected codec.
236+
#
237+
238+
# Fast encoding with a larger file size
239+
fast_output = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
240+
encoder.to_file(fast_output, codec="libx264", preset="ultrafast")
241+
print(f"Size of fast encoded file: {Path(fast_output).stat().st_size} bytes")
242+
243+
# Slow encoding for a smaller file size
244+
slow_output = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
245+
encoder.to_file(slow_output, codec="libx264", preset="veryslow")
246+
print(f"Size of slow encoded file: {Path(slow_output).stat().st_size} bytes")
247+
248+
# %%
249+
# .. _extra_options:
250+
#
251+
# Extra Options
252+
# -------------
253+
#
254+
# The ``extra_options`` parameter accepts a dictionary of codec-specific options
255+
# that would normally be set via FFmpeg command-line arguments. This enables
256+
# control of encoding settings beyond the common parameters.
257+
#
258+
# For example, some potential extra options for the commonly used H.264 codec, ``libx264``, include:
259+
#
260+
# - ``"g"`` - GOP (Group of Pictures) size / keyframe interval
261+
# - ``"max_b_frames"`` - Maximum number of B-frames between I and P frames
262+
# - ``"tune"`` - Tuning preset (e.g., ``"film"``, ``"animation"``, ``"grain"``)
263+
#
264+
# .. note::
265+
#
266+
# Use ``ffmpeg -h encoder=<codec_name>`` to see all available options for
267+
# a specific codec.
268+
#
269+
270+
271+
custom_output = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
272+
encoder.to_file(
273+
custom_output,
274+
codec="libx264",
275+
extra_options={
276+
"g": 50, # Keyframe every 50 frames
277+
"max_b_frames": 0, # Disable B-frames for faster decoding
278+
"tune": "fastdecode", # Optimize for fast decoding
279+
}
280+
)
281+
282+
# %%

src/torchcodec/_core/Encoder.cpp

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -662,7 +662,7 @@ VideoEncoder::~VideoEncoder() {
662662

663663
VideoEncoder::VideoEncoder(
664664
const torch::Tensor& frames,
665-
int frameRate,
665+
double frameRate,
666666
std::string_view fileName,
667667
const VideoStreamOptions& videoStreamOptions)
668668
: frames_(validateFrames(frames)), inFrameRate_(frameRate) {
@@ -694,7 +694,7 @@ VideoEncoder::VideoEncoder(
694694

695695
VideoEncoder::VideoEncoder(
696696
const torch::Tensor& frames,
697-
int frameRate,
697+
double frameRate,
698698
std::string_view formatName,
699699
std::unique_ptr<AVIOContextHolder> avioContextHolder,
700700
const VideoStreamOptions& videoStreamOptions)
@@ -787,9 +787,9 @@ void VideoEncoder::initializeEncoder(
787787
avCodecContext_->width = outWidth_;
788788
avCodecContext_->height = outHeight_;
789789
avCodecContext_->pix_fmt = outPixelFormat_;
790-
// TODO-VideoEncoder: Verify that frame_rate and time_base are correct
791-
avCodecContext_->time_base = {1, inFrameRate_};
792-
avCodecContext_->framerate = {inFrameRate_, 1};
790+
// TODO-VideoEncoder: Add and utilize output frame_rate option
791+
avCodecContext_->framerate = av_d2q(inFrameRate_, INT_MAX);
792+
avCodecContext_->time_base = av_inv_q(avCodecContext_->framerate);
793793

794794
// Set flag for containers that require extradata to be in the codec context
795795
if (avFormatContext_->oformat->flags & AVFMT_GLOBALHEADER) {
@@ -833,6 +833,10 @@ void VideoEncoder::initializeEncoder(
833833

834834
// Set the stream time base to encode correct frame timestamps
835835
avStream_->time_base = avCodecContext_->time_base;
836+
// Set the stream frame rate to store correct frame durations for some
837+
// containers (webm, mkv)
838+
avStream_->r_frame_rate = avCodecContext_->framerate;
839+
836840
status = avcodec_parameters_from_context(
837841
avStream_->codecpar, avCodecContext_.get());
838842
TORCH_CHECK(

src/torchcodec/_core/Encoder.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -143,13 +143,13 @@ class VideoEncoder {
143143

144144
VideoEncoder(
145145
const torch::Tensor& frames,
146-
int frameRate,
146+
double frameRate,
147147
std::string_view fileName,
148148
const VideoStreamOptions& videoStreamOptions);
149149

150150
VideoEncoder(
151151
const torch::Tensor& frames,
152-
int frameRate,
152+
double frameRate,
153153
std::string_view formatName,
154154
std::unique_ptr<AVIOContextHolder> avioContextHolder,
155155
const VideoStreamOptions& videoStreamOptions);
@@ -172,7 +172,7 @@ class VideoEncoder {
172172
UniqueSwsContext swsContext_;
173173

174174
const torch::Tensor frames_;
175-
int inFrameRate_;
175+
double inFrameRate_;
176176

177177
int inWidth_ = -1;
178178
int inHeight_ = -1;

src/torchcodec/_core/SingleStreamDecoder.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1039,7 +1039,7 @@ AudioFramesOutput SingleStreamDecoder::getFramesPlayedInRangeAudio(
10391039
firstFramePtsSeconds = frameOutput.ptsSeconds;
10401040
}
10411041
frames.push_back(frameOutput.data);
1042-
} catch (const EndOfFileException& e) {
1042+
} catch (const EndOfFileException&) {
10431043
finished = true;
10441044
}
10451045

0 commit comments

Comments
 (0)