[libcu++] Add initial cccl-runtime docs for 3.1 (#6562)

pciolkosz · web-flow · commit 318408d1aa3b · 2025-11-11T16:45:54.000-08:00
* Add initial cccl-runtime docs for 3.1

* Review feedback
diff --git a/docs/libcudacxx/Doxyfile b/docs/libcudacxx/Doxyfile
@@ -10,6 +10,10 @@ XML_OUTPUT             = xml
 XML_PROGRAMLISTING     = YES
 
 INPUT                  = ../../libcudacxx/include/cuda/__iterator \
+                         ../../libcudacxx/include/cuda/__stream \
+                         ../../libcudacxx/include/cuda/__device \
+                         ../../libcudacxx/include/cuda/__event \
+                         ../../libcudacxx/include/cuda/__algorithm \
                          ../../libcudacxx/include/nv
 
 RECURSIVE              = YES
diff --git a/docs/libcudacxx/extended_api.rst b/docs/libcudacxx/extended_api.rst
@@ -18,7 +18,6 @@ Extended API
    extended_api/type_traits
    extended_api/numeric
    extended_api/memory
-   extended_api/streams
    extended_api/memory_resource
    extended_api/math
    extended_api/mdspan
diff --git a/docs/libcudacxx/extended_api/streams.rst b/docs/libcudacxx/extended_api/streams.rst
diff --git a/docs/libcudacxx/extended_api/streams/stream_ref.rst b/docs/libcudacxx/extended_api/streams/stream_ref.rst
diff --git a/docs/libcudacxx/index.rst b/docs/libcudacxx/index.rst
@@ -12,6 +12,7 @@ libcu++
    releases
    standard_api
    extended_api
+   runtime
    ptx_api
    API reference <api/index>
 
diff --git a/docs/libcudacxx/runtime.rst b/docs/libcudacxx/runtime.rst
@@ -0,0 +1,78 @@
+.. _cccl-runtime:
+
+Runtime
+=======
+
+.. toctree::
+   :hidden:
+   :maxdepth: 1
+
+   runtime/stream
+   runtime/event
+   runtime/algorithm
+   runtime/device
+
+.. list-table::
+   :widths: 25 45 30 30
+   :header-rows: 1
+
+   * - **Header**
+     - **Content**
+     - **CCCL Availability**
+     - **CUDA Toolkit Availability**
+
+   * - :ref:`devices <cccl-runtime-device-devices>`
+     - A range of all available CUDA devices
+     - CCCL 3.1.0
+     - CUDA 13.1
+
+   * - :ref:`device_ref <cccl-runtime-device-device-ref>`
+     - A non-owning representation of a CUDA device
+     - CCCL 3.1.0
+     - CUDA 13.1
+
+   * - :ref:`arch_traits <cccl-runtime-device-arch-traits>`
+     - Per-architecture trait accessors
+     - CCCL 3.1.0
+     - CUDA 13.1
+
+
+   * - :ref:`stream_ref <cccl-runtime-stream-stream-ref>`
+     - A non-owning wrapper around a ``cudaStream_t``
+     - CCCL 2.2.0
+     - CUDA 12.3
+
+   * - :ref:`stream <cccl-runtime-stream-stream>`
+     - An owning wrapper around a ``cudaStream_t``
+     - CCCL 3.1.0
+     - CUDA 13.1
+
+   * - :ref:`event_ref <cccl-runtime-event-event-ref>`
+     - A non-owning wrapper around a ``cudaEvent_t``
+     - CCCL 3.1.0
+     - CUDA 13.1
+
+   * - :ref:`event <cccl-runtime-event-event>`
+     - An owning wrapper around a ``cudaEvent_t`` (timing disabled)
+     - CCCL 3.1.0
+     - CUDA 13.1
+
+   * - :ref:`timed_event <cccl-runtime-event-timed-event>`
+     - An owning wrapper around a ``cudaEvent_t`` with timing enabled and elapsed-time queries
+     - CCCL 3.1.0
+     - CUDA 13.1
+
+   * - :ref:`copy_bytes <cccl-runtime-algorithm-copy_bytes>`
+     - Stream-ordered byte-wise copy, enqueued on a ``cuda::stream_ref``, for ``cuda::std::span``/``cuda::std::mdspan`` sources and destinations
+     - CCCL 3.1.0
+     - CUDA 13.1
+
+   * - :ref:`fill_bytes <cccl-runtime-algorithm-fill_bytes>`
+     - Stream-ordered byte-wise fill, enqueued on a ``cuda::stream_ref``, for ``cuda::std::span``/``cuda::std::mdspan`` destinations
+     - CCCL 3.1.0
+     - CUDA 13.1
+
+   * - :ref:`Memory Resources <libcudacxx-extended-api-memory-resources>`
+     - ``cuda::mr`` interfaces (resources, wrappers, properties) usable with streams
+     - CCCL 2.2.0 (experimental), CCCL 3.1.0 (stable)
+     - CUDA 12.3 (experimental), CUDA 13.1 (stable)
diff --git a/docs/libcudacxx/runtime/algorithm.rst b/docs/libcudacxx/runtime/algorithm.rst
@@ -0,0 +1,56 @@
+.. _cccl-runtime-algorithm:
+
+Algorithm
+==========
+
+The ``runtime`` part of the ``cuda/algorithm`` header provides stream-ordered, byte-wise primitives that operate on ``cuda::std::span`` and
+``cuda::std::mdspan``-compatible types. They require a ``cuda::stream_ref`` to enqueue work.
+
+``cuda::copy_bytes``
+---------------------
+.. _cccl-runtime-algorithm-copy_bytes:
+
+Launch a byte-wise copy from source to destination on the provided stream.
+
+- Overloads accept ``cuda::std::span``-convertible contiguous ranges or ``cuda::std::mdspan``-convertible multi-dimensional views.
+- Elements must be trivially copyable
+- ``cuda::std::mdspan``-convertible types must convert to an ``mdspan`` that is exhaustive
+- Source access order (during the copy call or in stream order) can be configured with ``cuda::copy_configuration``
+
+Availability: CCCL 3.1.0 / CUDA 13.1
+
+.. code:: cpp
+
+   #include <cuda/algorithm>
+   #include <cuda/stream>
+   #include <cuda/std/span>
+
+   void copy_example(cuda::stream_ref s, int* d_dst, const int* d_src, std::size_t n) {
+     cuda::std::span<const int> src{d_src, n};
+     cuda::std::span<int>       dst{d_dst, n};
+     cuda::copy_bytes(s, src, dst);  // enqueued on s
+   }
+
+
+``cuda::fill_bytes``
+---------------------
+.. _cccl-runtime-algorithm-fill_bytes:
+
+Launch a byte-wise fill of the destination on the provided stream.
+
+- Overloads accept ``cuda::std::span``-convertible or ``cuda::std::mdspan``-convertible destinations.
+- Elements must be trivially copyable
+- ``cuda::std::mdspan``-convertible types must convert to an ``mdspan`` that is exhaustive
+
+Availability: CCCL 3.1.0 / CUDA 13.1
+
+.. code:: cpp
+
+   #include <cuda/algorithm>
+   #include <cuda/stream>
+   #include <cuda/std/span>
+
+   void fill_example(cuda::stream_ref s, int* d_dst, std::size_t n) {
+     cuda::std::span<int> dst{d_dst, n};
+     cuda::fill_bytes(s, dst, 0x00); // zero-fill device memory
+   }
diff --git a/docs/libcudacxx/runtime/device.rst b/docs/libcudacxx/runtime/device.rst
@@ -0,0 +1,104 @@
+.. _cccl-runtime-device:
+
+Devices
+=======
+
+``cuda::device_ref``
+---------------------
+.. _cccl-runtime-device-device-ref:
+
+``cuda::device_ref`` is a lightweight, non-owning handle to a CUDA device ordinal.
+It offers:
+
+- ``get()``: native device ordinal
+- ``name()``: device name
+- ``init()``: initialize the device context
+- ``peers()``: list peers for which peer access can be enabled
+- ``has_peer_access_to(device_ref)``: query if peer access can be enabled to the given device
+- ``attribute(attr)`` / ``attribute<::cudaDeviceAttr>()``: attribute queries
+
+Availability: CCCL 3.1.0 / CUDA 13.1
+
+``cuda::devices``
+------------------
+.. _cccl-runtime-device-devices:
+
+``cuda::devices`` is a random-access view of all available CUDA devices in the form of ``cuda::device_ref`` objects. It provides indexing, size, and iteration for use
+in range-based loops.
+
+Availability: CCCL 3.1.0 / CUDA 13.1
+
+Example:
+
+.. code:: cpp
+
+   #include <cuda/devices>
+   #include <iostream>
+
+   void print_devices() {
+    for (auto& dev : cuda::devices) {
+      std::cout << "Device " << dev.get() << ": " << dev.name() << std::endl;
+    }
+   }
+
+Device attributes
+-----------------
+.. _cccl-runtime-device-attributes:
+
+``cuda::device_attributes`` provides strongly-typed attribute query objects usable with
+``device_ref::attribute``. Selected examples:
+
+- ``compute_capability``
+- ``multiprocessor_count``
+- ``concurrent_managed_access``
+- ``clock_rate``
+- ``numa_id``
+
+Availability: CCCL 3.1.0 / CUDA 13.1
+
+Example:
+
+.. code:: cpp
+
+   #include <cuda/devices>
+
+   int get_max_blocks_on_device(cuda::device_ref dev) {
+     return cuda::device_attributes::multiprocessor_count(dev) * cuda::device_attributes::max_blocks_per_multiprocessor(dev);
+   }
+
+``cuda::arch_traits``
+---------------------
+.. _cccl-runtime-device-arch-traits:
+
+Per-architecture trait accessors providing limits and capabilities common to all devices of an architecture.
+Compared to ``device_attributes``, ``cuda::arch_traits`` provides a compile-time accessible structure that describes common characteristics of all devices of an architecture, while attributes are run-time queries of a single characteristic of a specific device.
+
+- ``cuda::arch_traits<cuda::arch_id::sm_80>()`` (compile-time) or
+  ``cuda::arch_traits_for(cuda::arch_id)`` / ``cuda::arch_traits_for(cuda::compute_capability)`` (run-time).
+- Returns a ``cuda::arch_traits_t`` with fields like
+  ``max_threads_per_block``, ``max_shared_memory_per_block``, ``cluster_supported`` and other capability flags.
+- Traits for the current architecture can be accessed with ``cuda::device::current_arch_traits()``
+
+Availability: CCCL 3.1.0 / CUDA 13.1
+
+Example:
+
+.. code:: cpp
+
+   #include <cuda/devices>
+
+   template <cuda::arch_id Arch>
+   __device__ void fn() {
+     constexpr auto traits = cuda::arch_traits<Arch>();
+     if constexpr (traits.cluster_supported) {
+       // cluster specific code
+     }
+     else {
+       // non-cluster code
+     }
+
+   }
+
+    __global__ void kernel() {
+      fn<cuda::arch_id::sm_90>();
+    }
diff --git a/docs/libcudacxx/runtime/event.rst b/docs/libcudacxx/runtime/event.rst
@@ -0,0 +1,78 @@
+.. _cccl-runtime-event:
+
+Events
+======
+
+``cuda::event_ref``
+--------------------------------------------------
+.. _cccl-runtime-event-event-ref:
+
+``cuda::event_ref`` is a non-owning wrapper around a ``cudaEvent_t``. It prevents unsafe implicit constructions from
+``nullptr`` or integer literals and provides convenient helpers:
+
+- ``record(cuda::stream_ref)``: record the event on a stream
+- ``sync()``: wait for the recorded work to complete
+- ``is_done()``: non-blocking completion query
+- comparison operators against other ``event_ref`` or ``cudaEvent_t``
+
+Availability: CCCL 3.1.0 / CUDA 13.1
+
+Example:
+
+.. code:: cpp
+
+   #include <cuda/event>
+   #include <cuda/stream>
+
+   void record_on_stream(cuda::stream_ref stream, cudaEvent_t raw_handle) {
+     cuda::event_ref e{raw_handle};
+     e.record(stream);
+   }
+
+.. _cccl-runtime-event-event:
+
+``cuda::event``
+--------------------------------------------
+
+``cuda::event`` is an owning wrapper around a ``cudaEvent_t`` (with timing disabled). It inherits from ``event_ref`` and provides all of its functionality.
+It also creates and destroys the native event, can be moved (but not copied), and can release ownership via ``release()``. Construction can target a specific
+``cuda::device_ref`` or record immediately on a ``cuda::stream_ref``.
+
+Availability: CCCL 3.1.0 / CUDA 13.1
+
+.. code:: cpp
+
+   #include <cuda/event>
+   #include <cuda/stream>
+   #include <cuda/devices>
+   #include <cuda/std/optional>
+
+   cuda::std::optional<cuda::event> query_and_record_on_stream(cuda::stream_ref stream) {
+     if (stream.is_done()) {
+       return cuda::std::nullopt;
+     }
+     else {
+       return cuda::event{stream};
+     }
+   }
+
+.. _cccl-runtime-event-timed-event:
+
+``cuda::timed_event``
+-----------------------------------------------------
+
+``cuda::timed_event`` is an owning wrapper for a timed ``cudaEvent_t``. It inherits from ``event`` and provides all of its functionality.
+It also supports elapsed-time queries between two events via ``operator-``, returning ``cuda::std::chrono::nanoseconds``.
+
+Availability: CCCL 3.1.0 / CUDA 13.1
+
+.. code:: cpp
+
+   #include <cuda/event>
+   #include <cuda/stream>
+   #include <cuda/std/chrono>
+
+   template <typename F>
+   cuda::std::chrono::nanoseconds measure_execution_time(cuda::stream_ref stream, F&& f) {
+     cuda::timed_event start{stream};
+     f(stream);
+     cuda::timed_event end{stream};
+     return end - start;
+   }
diff --git a/docs/libcudacxx/runtime/stream.rst b/docs/libcudacxx/runtime/stream.rst