diff --git a/benchmarks/encoders/benchmark_encoders.py b/benchmarks/encoders/benchmark_encoders.py
new file mode 100644
index 000000000..5b34c5d0e
--- /dev/null
+++ b/benchmarks/encoders/benchmark_encoders.py
@@ -0,0 +1,272 @@
+#!/usr/bin/env python3
+import subprocess
+import tempfile
+from argparse import ArgumentParser
+from pathlib import Path
+from time import perf_counter_ns
+
+import psutil
+import torch
+from torchcodec.decoders import VideoDecoder
+from torchcodec.encoders import VideoEncoder
+
+# GPU monitoring imports (install with: pip install nvidia-ml-py)
+try:
+    import pynvml
+
+    GPU_MONITORING_AVAILABLE = True
+except ImportError:
+    print("To enable GPU monitoring, install pynvml with: pip install nvidia-ml-py")
+    GPU_MONITORING_AVAILABLE = False
+
+DEFAULT_VIDEO_PATH = "test/resources/nasa_13013.mp4"  # default for --path
+# Alternatively, run this command to generate a longer test video:
+#   ffmpeg -f lavfi -i testsrc2=duration=600:size=1280x720:rate=30 -c:v libx264 -pix_fmt yuv420p test/resources/testsrc2_10min.mp4
+# DEFAULT_VIDEO_PATH = "test/resources/testsrc2_10min.mp4"
+DEFAULT_AVERAGE_OVER = 30  # default for --average-over (timed runs per benchmark)
+DEFAULT_MAX_FRAMES = 300  # default for --max-frames (frames decoded from the input)
+
+
+def gpu_percent():
+    """Return GPU 0's NVML utilization in percent, or 0.0 if it cannot be read."""
+    if GPU_MONITORING_AVAILABLE:
+        try:
+            pynvml.nvmlInit()
+            device = pynvml.nvmlDeviceGetHandleByIndex(0)
+            return float(pynvml.nvmlDeviceGetUtilizationRates(device).gpu)
+        except Exception:
+            pass  # best-effort probe: fall through to the 0.0 fallback
+    return 0.0
+
+
+def bench(f, average_over=50, warmup=2, **f_kwargs):
+    """Time ``f(**f_kwargs)`` and sample CPU/GPU utilization after each run.
+
+    Makes ``warmup`` untimed calls, then ``average_over`` timed ones. Returns
+    three float tensors with one entry per timed call: durations in
+    nanoseconds, ``psutil.cpu_percent`` readings and ``gpu_percent`` readings.
+    """
+    for _ in range(warmup):
+        f(**f_kwargs)
+
+    durations_ns, cpu_readings, gpu_readings = [], [], []
+    for _ in range(average_over):
+        # This reading is discarded; per the psutil docs a non-blocking call
+        # measures since the previous call, so it restarts the window here.
+        psutil.cpu_percent(interval=None)
+
+        t0 = perf_counter_ns()
+        f(**f_kwargs)
+        t1 = perf_counter_ns()
+        cpu_readings.append(psutil.cpu_percent(interval=None))
+        gpu_readings.append(gpu_percent())
+        durations_ns.append(t1 - t0)
+
+    return tuple(
+        torch.tensor(values).float()
+        for values in (durations_ns, cpu_readings, gpu_readings)
+    )
+
+
+def report_stats(
+    times, num_frames, cpu_utils=None, gpu_utils=None, prefix="", unit="ms"
+):
+    """Print timing statistics and, when supplied, CPU/GPU utilization.
+
+    ``times`` holds nanosecond durations; it is rescaled to ``unit`` ("ns",
+    "µs", "ms" or "s") before the median, mean, std, min and max are printed.
+    ``cpu_utils`` / ``gpu_utils`` are tensors of utilization percentages.
+    ``num_frames`` is currently unused.
+    """
+    ns_to_unit = {"ns": 1, "µs": 1e-3, "ms": 1e-6, "s": 1e-9}
+    scaled = times * ns_to_unit[unit]
+    std, med, mean = scaled.std().item(), scaled.median().item(), scaled.mean().item()
+    min_time, max_time = scaled.min().item(), scaled.max().item()
+    print(
+        f"\n{prefix}   {med = :.2f}, {mean = :.2f} +- {std:.2f}, {min_time = :.2f}, {max_time = :.2f} - in {unit}"
+    )
+
+    # A utilization line is printed only when its tensor was passed in; the
+    # GPU line additionally needs at least one sample.
+    if cpu_utils is not None:
+        cpu_avg, cpu_peak = cpu_utils.mean().item(), cpu_utils.max().item()
+        print(f"CPU utilization:      avg = {cpu_avg:.1f}%, peak = {cpu_peak:.1f}%")
+
+    if gpu_utils is not None and gpu_utils.numel() > 0:
+        gpu_avg, gpu_peak = gpu_utils.mean().item(), gpu_utils.max().item()
+        print(f"GPU utilization:      avg = {gpu_avg:.1f}%, peak = {gpu_peak:.1f}%")
+
+
+def encode_torchcodec(frames, output_path, device="cpu"):
+    """Encode ``frames`` at 30 fps: h264_nvenc for "cuda", libx264 otherwise."""
+    gpu_kwargs = {"codec": "h264_nvenc", "extra_options": {"qp": 1}}
+    cpu_kwargs = {"codec": "libx264", "crf": 0}
+    kwargs = gpu_kwargs if device == "cuda" else cpu_kwargs
+    VideoEncoder(frames=frames, frame_rate=30).to_file(dest=output_path, **kwargs)
+
+
+def write_raw_frames(frames, num_frames, raw_path):
+    """Dump the first ``num_frames`` NCHW frames to ``raw_path`` as NHWC bytes."""
+    # Truncate before permuting so only the frames that get written are copied.
+    raw_frames = frames[:num_frames].permute(0, 2, 3, 1).contiguous()
+    Path(raw_path).write_bytes(raw_frames.cpu().numpy().tobytes())
+
+
+def write_and_encode_ffmpeg_cli(
+    frames, num_frames, raw_path, output_path, device="cpu", write_frames=False
+):
+    """Encode the raw RGB24 dump at ``raw_path`` with the ffmpeg CLI at 30 fps.
+
+    Args:
+        frames: NCHW tensor; supplies the frame size and, with
+            ``write_frames``, the pixels to dump.
+        num_frames: Number of leading frames dumped when ``write_frames``.
+        raw_path: Raw RGB24 file that ffmpeg reads.
+        output_path: Destination of the encoded video (overwritten via -y).
+        device: "cuda" selects h264_nvenc, anything else libx264.
+        write_frames: Re-dump ``frames`` to ``raw_path`` inside this call so
+            that the caller's timing includes the write.
+
+    Raises:
+        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
+    """
+    if write_frames:
+        write_raw_frames(frames, num_frames, raw_path)
+    height, width = frames.shape[2], frames.shape[3]
+
+    if device == "cuda":
+        # Use the qp that encode_torchcodec passes to h264_nvenc so that both
+        # GPU benchmarks run at the same quantization.
+        codec, quality_params = "h264_nvenc", ["-qp", "1"]
+    else:
+        codec, quality_params = "libx264", ["-crf", "0"]
+
+    # Input side: raw RGB24 frames, so size and frame rate are given explicitly.
+    input_args = ["-f", "rawvideo", "-pix_fmt", "rgb24", "-s", f"{width}x{height}"]
+    input_args += ["-r", "30", "-i", raw_path]  # frame_rate is 30
+
+    # Output side. -threads is deliberately not passed so FFmpeg chooses.
+    # TODO: try setting threads on VideoEncoder too?
+    output_args = ["-c:v", codec, "-pix_fmt", "yuv420p", *quality_params]
+    output_args.append(str(output_path))
+
+    ffmpeg_cmd = ["ffmpeg", "-y", *input_args, *output_args]
+    subprocess.run(ffmpeg_cmd, check=True, capture_output=True)
+
+
+def main():
+    """Decode the input video once, then benchmark each encoder configuration."""
+    parser = ArgumentParser()
+    parser.add_argument(
+        "--path", type=str, help="Path to input video file", default=DEFAULT_VIDEO_PATH
+    )
+    parser.add_argument(
+        "--average-over",
+        type=int,
+        default=DEFAULT_AVERAGE_OVER,
+        help="Number of runs to average over",
+    )
+    parser.add_argument(
+        "--max-frames",
+        type=int,
+        default=DEFAULT_MAX_FRAMES,
+        help="Maximum number of frames to decode for benchmarking",
+    )
+    parser.add_argument(
+        "--write-frames",
+        action="store_true",
+        help="Include raw frame writing time in FFmpeg CLI benchmarks for fairer comparison with tensor-based workflows",
+    )
+
+    args = parser.parse_args()
+    print(
+        f"Benchmarking up to {args.max_frames} frames from {Path(args.path).name} over {args.average_over} runs:"
+    )
+    cuda_available = torch.cuda.is_available()
+    if not cuda_available:
+        print("CUDA not available. GPU benchmarks will be skipped.")
+
+    # Load up to max_frames frames; only copy them to the GPU when CUDA exists.
+    decoder = VideoDecoder(str(args.path))
+    frames = decoder.get_frames_in_range(
+        start=0, stop=min(args.max_frames, len(decoder))
+    ).data
+    gpu_frames = frames.cuda() if cuda_available else None
+    print(
+        f"Loaded {frames.shape[0]} frames of size {frames.shape[2]}x{frames.shape[3]}"
+    )
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        temp_dir = Path(temp_dir)
+        raw_frames_path = temp_dir / "input_frames.raw"
+        write_raw_frames(frames, args.max_frames, str(raw_frames_path))
+
+        # Benchmark torchcodec on GPU
+        if cuda_available:
+            gpu_output = temp_dir / "torchcodec_gpu.mp4"
+            times, _cpu_utils, gpu_utils = bench(
+                encode_torchcodec,
+                frames=gpu_frames,
+                output_path=str(gpu_output),
+                device="cuda",
+                average_over=args.average_over,
+                warmup=1,
+            )
+            report_stats(
+                times, frames.shape[0], None, gpu_utils, prefix="VideoEncoder on GPU"
+            )
+        else:
+            print("Skipping VideoEncoder GPU benchmark (CUDA not available)")
+
+        # Benchmark FFmpeg CLI on GPU
+        if cuda_available:
+            ffmpeg_gpu_output = temp_dir / "ffmpeg_gpu.mp4"
+            times, _cpu_utils, gpu_utils = bench(
+                write_and_encode_ffmpeg_cli,
+                frames=gpu_frames,
+                num_frames=args.max_frames,
+                raw_path=str(raw_frames_path),
+                output_path=str(ffmpeg_gpu_output),
+                device="cuda",
+                write_frames=args.write_frames,
+                average_over=args.average_over,
+                warmup=1,
+            )
+            prefix = "FFmpeg CLI on GPU  "
+            report_stats(times, frames.shape[0], None, gpu_utils, prefix=prefix)
+        else:
+            print("Skipping FFmpeg CLI GPU benchmark (CUDA not available)")
+
+        # Benchmark torchcodec on CPU
+        cpu_output = temp_dir / "torchcodec_cpu.mp4"
+        times, cpu_utils, _gpu_utils = bench(
+            encode_torchcodec,
+            frames=frames,
+            output_path=str(cpu_output),
+            device="cpu",
+            average_over=args.average_over,
+            warmup=1,
+        )
+        report_stats(
+            times, frames.shape[0], cpu_utils, None, prefix="VideoEncoder on CPU"
+        )
+
+        # Benchmark FFmpeg CLI on CPU
+        ffmpeg_cpu_output = temp_dir / "ffmpeg_cpu.mp4"
+        times, cpu_utils, _gpu_utils = bench(
+            write_and_encode_ffmpeg_cli,
+            frames=frames,
+            num_frames=args.max_frames,
+            raw_path=str(raw_frames_path),
+            output_path=str(ffmpeg_cpu_output),
+            device="cpu",
+            write_frames=args.write_frames,
+            average_over=args.average_over,
+            warmup=1,
+        )
+        prefix = "FFmpeg CLI on CPU  "
+        report_stats(times, frames.shape[0], cpu_utils, None, prefix=prefix)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/torchcodec/_core/CMakeLists.txt b/src/torchcodec/_core/CMakeLists.txt
index 4fc7af75e..9b7f18b8f 100644
--- a/src/torchcodec/_core/CMakeLists.txt
+++ b/src/torchcodec/_core/CMakeLists.txt
@@ -100,7 +100,7 @@ function(make_torchcodec_libraries
     )
 
     if(ENABLE_CUDA)
-	    list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp NVCUVIDRuntimeLoader.cpp)
+	    list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp NVCUVIDRuntimeLoader.cpp GpuEncoder.cpp)
     endif()
 
     set(core_library_dependencies
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
index 19ac9220d..2569a72ab 100644
--- a/src/torchcodec/_core/Encoder.cpp
+++ b/src/torchcodec/_core/Encoder.cpp
@@ -724,6 +724,9 @@ VideoEncoder::VideoEncoder(
 
 void VideoEncoder::initializeEncoder(
     const VideoStreamOptions& videoStreamOptions) {
+  if (frames_.device().is_cuda()) {
+    gpuEncoder_ = std::make_unique<GpuEncoder>(frames_.device());
+  }
   const AVCodec* avCodec = nullptr;
   // If codec arg is provided, find codec using logic similar to FFmpeg:
   // https://github.com/FFmpeg/FFmpeg/blob/master/fftools/ffmpeg_opt.c#L804-L835
@@ -820,6 +823,12 @@ void VideoEncoder::initializeEncoder(
         videoStreamOptions.preset.value().c_str(),
         0);
   }
+
+  if (gpuEncoder_) {
+    gpuEncoder_->registerHardwareDeviceWithCodec(avCodecContext_.get());
+    gpuEncoder_->setupHardwareFrameContext(avCodecContext_.get());
+  }
+
   int status = avcodec_open2(avCodecContext_.get(), avCodec, &avCodecOptions);
   av_dict_free(&avCodecOptions);
 
@@ -860,7 +869,13 @@ void VideoEncoder::encode() {
   int numFrames = static_cast<int>(frames_.sizes()[0]);
   for (int i = 0; i < numFrames; ++i) {
     torch::Tensor currFrame = frames_[i];
-    UniqueAVFrame avFrame = convertTensorToAVFrame(currFrame, i);
+    UniqueAVFrame avFrame;
+    if (gpuEncoder_) {
+      avFrame = gpuEncoder_->convertTensorToAVFrame(
+          currFrame, outPixelFormat_, i, avCodecContext_.get());
+    } else {
+      avFrame = convertTensorToAVFrame(currFrame, i);
+    }
     encodeFrame(autoAVPacket, avFrame);
   }
 
diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h
index 1bdc1e443..fe3284737 100644
--- a/src/torchcodec/_core/Encoder.h
+++ b/src/torchcodec/_core/Encoder.h
@@ -3,7 +3,9 @@
 #include <map>
 #include <string>
 #include "AVIOContextHolder.h"
+#include "DeviceInterface.h"
 #include "FFMPEGCommon.h"
+#include "GpuEncoder.h"
 #include "StreamOptions.h"
 
 extern "C" {
@@ -183,6 +185,7 @@ class VideoEncoder {
   AVPixelFormat outPixelFormat_ = AV_PIX_FMT_NONE;
 
   std::unique_ptr<AVIOContextHolder> avioContextHolder_;
+  std::unique_ptr<GpuEncoder> gpuEncoder_;
 
   bool encodeWasCalled_ = false;
   AVDictionary* avFormatOptions_ = nullptr;
diff --git a/src/torchcodec/_core/GpuEncoder.cpp b/src/torchcodec/_core/GpuEncoder.cpp
new file mode 100644
index 000000000..5e80a1e06
--- /dev/null
+++ b/src/torchcodec/_core/GpuEncoder.cpp
@@ -0,0 +1,219 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include "GpuEncoder.h"
+
+#include <ATen/cuda/CUDAEvent.h>
+#include <c10/cuda/CUDAStream.h>
+#include <cuda_runtime.h>
+#include <torch/types.h>
+
+#include "CUDACommon.h"
+#include "FFMPEGCommon.h"
+
+extern "C" {
+#include <libavutil/hwcontext_cuda.h>
+#include <libavutil/pixdesc.h>
+}
+
+namespace facebook::torchcodec {
+namespace {
+
+// Copy of the CudaDeviceInterface.cpp helper: flags for av_hwdevice_ctx_create.
+int getFlagsAVHardwareDeviceContextCreate() {
+  int flags = 0;
+#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(58, 26, 100)
+  flags = AV_CUDA_USE_CURRENT_CONTEXT;
+#endif
+  return flags;
+}
+
+// Mirrors the helper of the same name in CudaDeviceInterface.cpp's anonymous
+// namespace: creates an FFmpeg "cuda" hardware device context for `device`.
+// Throws (via TORCH_CHECK) if FFmpeg has no "cuda" device type, if the CUDA
+// device cannot be made current, or if FFmpeg fails to create the context.
+// TODO-VideoEncoder: unify device context creation, add caching to encoder
+UniqueAVBufferRef createHardwareDeviceContext(const torch::Device& device) {
+  const AVHWDeviceType cudaType = av_hwdevice_find_type_by_name("cuda");
+  TORCH_CHECK(cudaType != AV_HWDEVICE_TYPE_NONE, "Failed to find cuda device");
+
+  const int deviceIndex = getDeviceIndex(device);
+  const std::string deviceOrdinal = std::to_string(deviceIndex);
+
+  // We set the device because we may be called from a different thread than
+  // the one that initialized the cuda context.
+  c10::cuda::CUDAGuard deviceGuard(device);
+  const cudaError_t setDeviceStatus = cudaSetDevice(deviceIndex);
+  TORCH_CHECK(setDeviceStatus == cudaSuccess, "Failed to set CUDA device");
+
+  AVBufferRef* rawCtx = nullptr;
+  const int status = av_hwdevice_ctx_create(
+      &rawCtx,
+      cudaType,
+      deviceOrdinal.c_str(),
+      /*opts=*/nullptr,
+      getFlagsAVHardwareDeviceContextCreate());
+
+  /* clang-format off */
+  TORCH_CHECK(
+      status >= 0,
+      "Failed to create specified HW device. This typically happens when ",
+      "your installed FFmpeg doesn't support CUDA (see ",
+      "https://github.com/pytorch/torchcodec#installing-cuda-enabled-torchcodec",
+      "). FFmpeg error: ", getFFMPEGErrorStringFromErrorCode(status));
+  /* clang-format on */
+
+  return UniqueAVBufferRef(rawCtx);
+}
+
+// RGB to NV12 color conversion matrix (inverse of YUV to RGB). Rows produce
+// Y, U and V; the fourth column is the constant offset added to each.
+// Note: NPP's ColorTwist function apparently expects "limited range"
+// coefficient format even when producing full range output, so the matrix
+// below uses that format (Y with +16 offset) for NPP compatibility.
+
+// BT.601 limited range (matches FFmpeg default behavior)
+const Npp32f defaultLimitedRangeRgbToNv12[3][4] = {
+    // Y = 16 + 0.859 * (0.299*R + 0.587*G + 0.114*B)
+    {0.257f, 0.504f, 0.098f, 16.0f},
+    // U = -0.148*R - 0.291*G + 0.439*B + 128 (BT.601 coefficients)
+    {-0.148f, -0.291f, 0.439f, 128.0f},
+    // V = 0.439*R - 0.368*G - 0.071*B + 128 (BT.601 coefficients)
+    {0.439f, -0.368f, -0.071f, 128.0f}};
+} // namespace
+
+GpuEncoder::GpuEncoder(const torch::Device& device) : device_(device) {
+  // Reject non-CUDA devices before any CUDA or FFmpeg state is created.
+  const bool isCuda = device_.type() == torch::kCUDA;
+  TORCH_CHECK(isCuda, "Unsupported device: ", device_.str());
+  initializeCudaContextWithPytorch(device_);
+  initializeHardwareContext();
+}
+
+GpuEncoder::~GpuEncoder() = default;
+
+void GpuEncoder::initializeHardwareContext() {
+  hardwareDeviceCtx_ = createHardwareDeviceContext(device_); // FFmpeg side
+  nppCtx_ = getNppStreamContext(device_); // used for the RGB->NV12 conversion
+}
+
+void GpuEncoder::registerHardwareDeviceWithCodec(AVCodecContext* codecContext) {
+  auto* deviceCtx = hardwareDeviceCtx_.get();
+  TORCH_CHECK(deviceCtx, "Hardware device context has not been initialized");
+  TORCH_CHECK(codecContext != nullptr, "codecContext is null");
+  codecContext->hw_device_ctx = av_buffer_ref(deviceCtx); // codec's own ref
+}
+
+// Builds a CUDA AVHWFramesContext sized like `codecContext`, forces the
+// codec's pixel formats to AV_PIX_FMT_CUDA (NV12 underneath) and stores the
+// result in codecContext->hw_frames_ctx, which FFmpeg needs to allocate
+// frames in GPU memory. Throws via TORCH_CHECK on any failure.
+void GpuEncoder::setupHardwareFrameContext(AVCodecContext* codecContext) {
+  TORCH_CHECK(
+      hardwareDeviceCtx_, "Hardware device context has not been initialized");
+  TORCH_CHECK(codecContext != nullptr, "codecContext is null");
+
+  AVBufferRef* framesRef = av_hwframe_ctx_alloc(hardwareDeviceCtx_.get());
+  TORCH_CHECK(
+      framesRef != nullptr,
+      "Failed to allocate hardware frames context for codec");
+
+  // Always set pixel formats to options that support CUDA encoding.
+  // TODO-VideoEncoder: Enable user set pixel formats to be set and properly
+  // handled with NPP functions below
+  codecContext->sw_pix_fmt = AV_PIX_FMT_NV12;
+  codecContext->pix_fmt = AV_PIX_FMT_CUDA;
+
+  auto* framesCtx = reinterpret_cast<AVHWFramesContext*>(framesRef->data);
+  framesCtx->format = codecContext->pix_fmt;
+  framesCtx->sw_format = codecContext->sw_pix_fmt;
+  framesCtx->width = codecContext->width;
+  framesCtx->height = codecContext->height;
+
+  const int initStatus = av_hwframe_ctx_init(framesRef);
+  if (initStatus < 0) {
+    av_buffer_unref(&framesRef);
+    TORCH_CHECK(
+        false,
+        "Failed to initialize CUDA frames context for codec: ",
+        getFFMPEGErrorStringFromErrorCode(initStatus));
+  }
+
+  codecContext->hw_frames_ctx = framesRef;
+}
+
+// Converts one CHW uint8 RGB CUDA tensor into a CUDA AVFrame holding NV12
+// data (BT.601, limited range) taken from codecContext->hw_frames_ctx, with
+// pts = frameIndex. targetFormat is currently ignored: output is always NV12.
+UniqueAVFrame GpuEncoder::convertTensorToAVFrame(
+    const torch::Tensor& tensor,
+    [[maybe_unused]] AVPixelFormat targetFormat,
+    int frameIndex,
+    AVCodecContext* codecContext) {
+  TORCH_CHECK(
+      tensor.dim() == 3 && tensor.size(0) == 3,
+      "Expected 3D RGB tensor (CHW format), got shape: ",
+      tensor.sizes());
+  TORCH_CHECK(
+      tensor.is_cuda() && tensor.scalar_type() == torch::kUInt8,
+      "Expected uint8 CUDA tensor, got dtype: ",
+      tensor.scalar_type());
+  TORCH_CHECK(
+      codecContext != nullptr && codecContext->hw_frames_ctx != nullptr,
+      "codecContext has no hardware frames context");
+
+  UniqueAVFrame avFrame(av_frame_alloc());
+  TORCH_CHECK(avFrame != nullptr, "Failed to allocate AVFrame");
+  int height = static_cast<int>(tensor.size(1));
+  int width = static_cast<int>(tensor.size(2));
+
+  // TODO-VideoEncoder: Unify AVFrame creation with CPU version of this method
+  avFrame->format = AV_PIX_FMT_CUDA;
+  avFrame->height = height;
+  avFrame->width = width;
+  avFrame->pts = frameIndex;
+
+  // FFmpeg's av_hwframe_get_buffer is used to allocate memory on CUDA device.
+  // TODO-VideoEncoder: Consider using pytorch to allocate CUDA memory for
+  // efficiency
+  int ret =
+      av_hwframe_get_buffer(codecContext->hw_frames_ctx, avFrame.get(), 0);
+  TORCH_CHECK(
+      ret >= 0,
+      "Failed to allocate hardware frame: ",
+      getFFMPEGErrorStringFromErrorCode(ret));
+
+  torch::Tensor hwcFrame = tensor.permute({1, 2, 0}).contiguous();
+  nppCtx_->hStream = at::cuda::getCurrentCUDAStream(device_.index()).stream();
+  cudaError_t cudaErr =
+      cudaStreamGetFlags(nppCtx_->hStream, &nppCtx_->nStreamFlags);
+  TORCH_CHECK(
+      cudaErr == cudaSuccess,
+      "cudaStreamGetFlags failed: ",
+      cudaGetErrorString(cudaErr));
+
+  NppiSize oSizeROI = {width, height};
+  NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
+      static_cast<const Npp8u*>(hwcFrame.data_ptr()),
+      hwcFrame.stride(0) * hwcFrame.element_size(),
+      avFrame->data,
+      avFrame->linesize,
+      oSizeROI,
+      defaultLimitedRangeRgbToNv12,
+      *nppCtx_);
+  TORCH_CHECK(
+      status == NPP_SUCCESS,
+      "Failed to convert RGB to NV12: NPP error code ",
+      status);
+
+  // TODO-VideoEncoder: Enable configuration of color properties, similar to
+  // FFmpeg. Below are the default color properties used by FFmpeg.
+  avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601
+  avFrame->color_range = AVCOL_RANGE_MPEG; // Limited range
+  return avFrame;
+}
+
+} // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/GpuEncoder.h b/src/torchcodec/_core/GpuEncoder.h
new file mode 100644
index 000000000..a5a6ad68c
--- /dev/null
+++ b/src/torchcodec/_core/GpuEncoder.h
@@ -0,0 +1,51 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <torch/types.h>
+#include <memory>
+#include <optional>
+
+#include "CUDACommon.h"
+#include "FFMPEGCommon.h"
+#include "StreamOptions.h"
+
+extern "C" {
+#include <libavcodec/avcodec.h>
+#include <libavutil/buffer.h>
+#include <libavutil/hwcontext.h>
+}
+
+namespace facebook::torchcodec {
+
+class GpuEncoder {
+ public:
+  explicit GpuEncoder(const torch::Device& device);
+  ~GpuEncoder();
+  // Codec setup: share the device context, then create codec hw_frames_ctx.
+  void registerHardwareDeviceWithCodec(AVCodecContext* codecContext);
+  void setupHardwareFrameContext(AVCodecContext* codecContext);
+  // Converts one CHW RGB tensor into an NV12 CUDA AVFrame with pts frameIndex.
+  UniqueAVFrame convertTensorToAVFrame(
+      const torch::Tensor& tensor,
+      AVPixelFormat targetFormat,
+      int frameIndex,
+      AVCodecContext* codecContext);
+
+  const torch::Device& device() const {
+    return device_;
+  }
+
+ private:
+  torch::Device device_;
+  UniqueAVBufferRef hardwareDeviceCtx_;
+  UniqueNppContext nppCtx_;
+  // Creates hardwareDeviceCtx_ and nppCtx_; called from the constructor.
+  void initializeHardwareContext();
+};
+
+} // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/StreamOptions.h b/src/torchcodec/_core/StreamOptions.h
index ce0f27d3b..9faafb502 100644
--- a/src/torchcodec/_core/StreamOptions.h
+++ b/src/torchcodec/_core/StreamOptions.h
@@ -41,6 +41,8 @@ struct VideoStreamOptions {
       ColorConversionLibrary::FILTERGRAPH;
 
   // By default we use CPU for decoding for both C++ and python users.
+  // Note: For video encoding, device is determined by the location of the input
+  // frame tensor.
   torch::Device device = torch::kCPU;
   // Device variant (e.g., "ffmpeg", "beta", etc.)
   std::string_view deviceVariant = "ffmpeg";
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
index 4ec72974d..7030928a5 100644
--- a/src/torchcodec/_core/custom_ops.cpp
+++ b/src/torchcodec/_core/custom_ops.cpp
@@ -1019,6 +1019,9 @@ TORCH_LIBRARY_IMPL(torchcodec_ns, BackendSelect, m) {
   m.impl("_create_from_file_like", &_create_from_file_like);
   m.impl(
       "_get_json_ffmpeg_library_versions", &_get_json_ffmpeg_library_versions);
+  m.impl("encode_video_to_file", &encode_video_to_file);
+  m.impl("encode_video_to_tensor", &encode_video_to_tensor);
+  m.impl("_encode_video_to_file_like", &_encode_video_to_file_like);
 }
 
 TORCH_LIBRARY_IMPL(torchcodec_ns, CPU, m) {
diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py
index 921f5ee54..8dd50c99d 100644
--- a/src/torchcodec/_core/ops.py
+++ b/src/torchcodec/_core/ops.py
@@ -222,7 +222,7 @@ def encode_video_to_file_like(
     """Encode video frames to a file-like object.
 
     Args:
-        frames: Video frames tensor
+        frames: Video frames tensor. The device of the frames tensor will be used for encoding.
         frame_rate: Frame rate in frames per second
         format: Video format (e.g., "mp4", "mov", "mkv")
         file_like: File-like object that supports write() and seek() methods
diff --git a/src/torchcodec/encoders/_video_encoder.py b/src/torchcodec/encoders/_video_encoder.py
index 3fede6b8e..a240052a6 100644
--- a/src/torchcodec/encoders/_video_encoder.py
+++ b/src/torchcodec/encoders/_video_encoder.py
@@ -15,6 +15,7 @@ class VideoEncoder:
             tensor of shape ``(N, C, H, W)`` where N is the number of frames,
             C is 3 channels (RGB), H is height, and W is width.
             Values must be uint8 in the range ``[0, 255]``.
+            The device of the frames tensor will be used for encoding.
         frame_rate (float): The frame rate of the **input** ``frames``. Also defines the encoded **output** frame rate.
     """
 
diff --git a/test/test_encoders.py b/test/test_encoders.py
index 543025599..7761daf5a 100644
--- a/test/test_encoders.py
+++ b/test/test_encoders.py
@@ -796,15 +796,22 @@ def test_extra_options_errors(self, method, tmp_path, extra_options, error):
             getattr(encoder, method)(**valid_params, extra_options=extra_options)
 
     @pytest.mark.parametrize("method", ("to_file", "to_tensor", "to_file_like"))
-    def test_contiguity(self, method, tmp_path):
+    @pytest.mark.parametrize(
+        "device", ("cpu", pytest.param("cuda", marks=pytest.mark.needs_cuda))
+    )
+    def test_contiguity(self, method, tmp_path, device):
         # Ensure that 2 sets of video frames with the same pixel values are encoded
         # in the same way, regardless of their memory layout. Here we encode 2 equal
         # frame tensors, one is contiguous while the other is non-contiguous.
 
-        num_frames, channels, height, width = 5, 3, 64, 64
-        contiguous_frames = torch.randint(
-            0, 256, size=(num_frames, channels, height, width), dtype=torch.uint8
-        ).contiguous()
+        num_frames, channels, height, width = 5, 3, 256, 256
+        contiguous_frames = (
+            torch.randint(
+                0, 256, size=(num_frames, channels, height, width), dtype=torch.uint8
+            )
+            .contiguous()
+            .to(device)
+        )
         assert contiguous_frames.is_contiguous()
 
         # Permute NCHW to NHWC, then update the memory layout, then permute back
@@ -820,7 +827,11 @@ def test_contiguity(self, method, tmp_path):
         )
 
         def encode_to_tensor(frames):
-            common_params = dict(crf=0, pixel_format="yuv444p")
+            common_params = dict(
+                crf=0,
+                pixel_format="yuv444p",
+                codec="h264_nvenc" if device != "cpu" else None,
+            )
             if method == "to_file":
                 dest = str(tmp_path / "output.mp4")
                 VideoEncoder(frames, frame_rate=30).to_file(dest=dest, **common_params)
@@ -1259,3 +1270,105 @@ def test_extra_options_utilized(self, tmp_path, profile, colorspace, color_range
         assert metadata["profile"].lower() == expected_profile
         assert metadata["color_space"] == colorspace
         assert metadata["color_range"] == color_range
+
+    @pytest.mark.needs_cuda
+    @pytest.mark.skipif(in_fbcode(), reason="ffmpeg CLI not available")
+    @pytest.mark.parametrize(
+        "format_codec",
+        [
+            ("mov", "h264_nvenc"),
+            ("mp4", "hevc_nvenc"),
+            ("avi", "h264_nvenc"),
+            # ("mkv", "av1_nvenc"), # av1_nvenc is not supported on CI
+        ],
+    )
+    @pytest.mark.parametrize("method", ("to_file", "to_tensor", "to_file_like"))
+    # TODO-VideoEncoder: Enable additional pixel formats ("yuv420p", "yuv444p")
+    def test_nvenc_against_ffmpeg_cli(self, tmp_path, format_codec, method):
+        # Encode the same frames with the FFmpeg CLI and with VideoEncoder, both
+        # using NVENC codecs, then check that the decoded outputs are close.
+        format, codec = format_codec
+        device = "cuda"
+        pixel_format = "nv12"
+        qp = 1  # Lossless (qp=0) is not supported on av1_nvenc, so we use 1
+        source_frames = self.decode(TEST_SRC_2_720P.path).data.to(device)
+
+        temp_raw_path = str(tmp_path / "temp_input.raw")
+        with open(temp_raw_path, "wb") as f:
+            f.write(source_frames.permute(0, 2, 3, 1).cpu().numpy().tobytes())
+
+        ffmpeg_encoded_path = str(tmp_path / f"ffmpeg_nvenc_output.{format}")
+        frame_rate = 30
+
+        ffmpeg_cmd = [
+            "ffmpeg",
+            "-y",
+            "-f",
+            "rawvideo",
+            "-pix_fmt",
+            "rgb24",  # Input format
+            "-s",
+            f"{source_frames.shape[3]}x{source_frames.shape[2]}",
+            "-r",
+            str(frame_rate),
+            "-i",
+            temp_raw_path,
+            "-c:v",
+            codec,  # Use specified NVENC hardware encoder
+        ]
+        ffmpeg_cmd.extend(["-pix_fmt", pixel_format])  # Output format
+        if codec == "av1_nvenc":
+            ffmpeg_cmd.extend(["-rc", "constqp"])  # Set rate control mode for AV1
+        ffmpeg_cmd.extend(["-qp", str(qp)])  # Same fixed qp for every codec
+        ffmpeg_cmd.extend([ffmpeg_encoded_path])
+
+        # TODO-VideoEncoder: Ensure CI does not skip this test, as we know NVENC is available.
+        try:
+            subprocess.run(ffmpeg_cmd, check=True, capture_output=True)
+        except subprocess.CalledProcessError as e:
+            if b"No NVENC capable devices found" in e.stderr:
+                pytest.skip("NVENC not available on this system")
+            else:
+                raise
+
+        encoder = VideoEncoder(frames=source_frames, frame_rate=frame_rate)
+        encoder_extra_options = {"qp": qp}
+        if codec == "av1_nvenc":
+            encoder_extra_options["rc"] = 0  # constqp mode
+        if method == "to_file":
+            encoder_output_path = str(tmp_path / f"nvenc_output.{format}")
+            encoder.to_file(
+                dest=encoder_output_path,
+                codec=codec,
+                pixel_format=pixel_format,
+                extra_options=encoder_extra_options,
+            )
+            encoder_output = encoder_output_path
+        elif method == "to_tensor":
+            encoder_output = encoder.to_tensor(
+                format=format,
+                codec=codec,
+                pixel_format=pixel_format,
+                extra_options=encoder_extra_options,
+            )
+        elif method == "to_file_like":
+            file_like = io.BytesIO()
+            encoder.to_file_like(
+                file_like=file_like,
+                format=format,
+                codec=codec,
+                pixel_format=pixel_format,
+                extra_options=encoder_extra_options,
+            )
+            encoder_output = file_like.getvalue()
+        else:
+            raise ValueError(f"Unknown method: {method}")
+
+        ffmpeg_frames = self.decode(ffmpeg_encoded_path).data
+        encoder_frames = self.decode(encoder_output).data
+
+        # Compare the two decoded outputs frame by frame (PSNR floor + tolerance).
+        assert ffmpeg_frames.shape[0] == encoder_frames.shape[0]
+        for ff_frame, enc_frame in zip(ffmpeg_frames, encoder_frames):
+            assert psnr(ff_frame, enc_frame) > 25
+            assert_tensor_close_on_at_least(ff_frame, enc_frame, percentage=95, atol=2)