From 26759a82a3382bef4929765f1413058d5f71e109 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 17 Dec 2024 17:36:16 +0100 Subject: [PATCH 01/81] Use Adapt.jl to change storage and element type In order to eventually support GPU computation we need to use Adapt.jl to allow GPU backend packages to swap out host-array types like `CuArray` with device-side types like `CuDeviceArray`. Additionally this will allow us to change the element type of a simulation by using `adapt(Array{Float32}`. Co-authored-by: Lars Christmann Co-authored-by: Benedict Geihe --- Project.toml | 2 + src/Trixi.jl | 2 + src/auxiliary/containers.jl | 84 +++++ src/auxiliary/vector_of_arrays.jl | 31 ++ .../semidiscretization_hyperbolic.jl | 27 +- src/solvers/dg.jl | 3 + src/solvers/dgsem/basis_lobatto_legendre.jl | 37 +++ src/solvers/dgsem_p4est/containers.jl | 314 ++++++++++++++---- .../dgsem_p4est/containers_parallel.jl | 114 +++++-- src/solvers/dgsem_p4est/dg_parallel.jl | 60 ++-- .../sort_boundary_conditions.jl | 17 +- test/Project.toml | 1 + test/test_p4est_2d.jl | 6 + test/test_unstructured_2d.jl | 7 + 14 files changed, 567 insertions(+), 138 deletions(-) create mode 100644 src/auxiliary/vector_of_arrays.jl diff --git a/Project.toml b/Project.toml index b53431fd171..204c4088f2f 100644 --- a/Project.toml +++ b/Project.toml @@ -4,6 +4,7 @@ authors = ["Michael Schlottke-Lakemper ", " version = "0.11.16-DEV" [deps] +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697" CodeTracking = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2" ConstructionBase = "187b0558-2788-49d3-abe0-74a17ed4e7c9" @@ -63,6 +64,7 @@ TrixiMakieExt = "Makie" TrixiNLsolveExt = "NLsolve" [compat] +Adapt = "4" Accessors = "0.1.36" CodeTracking = "1.0.5" ConstructionBase = "1.5" diff --git a/src/Trixi.jl b/src/Trixi.jl index 8f13835dbae..3844746b777 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -44,6 +44,7 @@ import SciMLBase: get_du, get_tmp_cache, u_modified!, using DelimitedFiles: readdlm using Downloads: Downloads +using Adapt: Adapt, adapt using CodeTracking: CodeTracking using ConstructionBase: ConstructionBase using DiffEqBase: DiffEqBase, get_tstops, get_tstops_array @@ -125,6 +126,7 @@ include("basic_types.jl") # Include all top-level source files include("auxiliary/auxiliary.jl") +include("auxiliary/vector_of_arrays.jl") include("auxiliary/mpi.jl") include("auxiliary/p4est.jl") include("auxiliary/t8code.jl") diff --git a/src/auxiliary/containers.jl b/src/auxiliary/containers.jl index 90650f6abcf..5738467ec6b 100644 --- a/src/auxiliary/containers.jl +++ b/src/auxiliary/containers.jl @@ -314,4 +314,88 @@ end function raw_copy!(c::AbstractContainer, from::Int, destination::Int) raw_copy!(c, c, from, from, destination) end + +# Trixi storage types must implement these two Adapt.jl methods +function Adapt.adapt_structure(to, c::AbstractContainer) + error("Interface: Must implement Adapt.adapt_structure(to, ::$(typeof(c)))") +end + +function Adapt.parent_type(C::Type{<:AbstractContainer}) + error("Interface: Must implement Adapt.parent_type(::Type{$C}") +end + +function Adapt.unwrap_type(C::Type{<:AbstractContainer}) + return Adapt.unwrap_type(Adapt.parent_type(C)) +end + +# TODO: Upstream to Adapt +function storage_type(x) + return storage_type(typeof(x)) +end + +function storage_type(T::Type) + error("Interface: Must implement storage_type(::Type{$T}") +end + +function storage_type(::Type{<:Array}) + Array +end + +function storage_type(C::Type{<:AbstractContainer}) + return 
storage_type(Adapt.unwrap_type(C))
+end
+
+# For some storage backends like CUDA.jl, empty arrays can simply be
+# null pointers, which causes `unsafe_wrap` to fail when calling
+# `Adapt.adapt` (ArgumentError, see
+# https://github.com/JuliaGPU/CUDA.jl/blob/v5.4.2/src/array.jl#L212-L229).
+# To circumvent this, we allocate a separate empty array for length-zero
+# vectors instead of wrapping.
+# However, since length-zero arrays are not used in calculations,
+# it should be okay if the underlying storage vectors and wrapped arrays
+# are not the same, as long as they are properly wrapped when `resize!`d etc.
+function unsafe_wrap_or_alloc(to, vector, size)
+    if length(vector) == 0
+        return similar(vector, size)
+    else
+        return unsafe_wrap(to, pointer(vector), size)
+    end
+end
+
+struct TrixiAdaptor{Storage, Real} end
+
+function trixi_adapt(storage, real, x)
+    adapt(TrixiAdaptor{storage, real}(), x)
+end
+
+# Custom rules
+# 1. Handling of StaticArrays
+function Adapt.adapt_storage(::TrixiAdaptor{<:Any, Real},
+                             x::StaticArrays.StaticArray{S, T, N}) where {Real, S, T, N}
+    StaticArrays.similar_type(x, Real)(x)
+end
+
+# 2. Handling of Arrays
+function Adapt.adapt_storage(::TrixiAdaptor{Storage, Real},
+                             x::AbstractArray{T}) where {Storage, Real,
+                                                         T <: AbstractFloat}
+    adapt(Storage{Real}, x)
+end
+
+function Adapt.adapt_storage(::TrixiAdaptor{Storage, Real},
+                             x::AbstractArray{T}) where {Storage, Real,
+                                                         T <: StaticArrays.StaticArray}
+    adapt(Storage{StaticArrays.similar_type(T, Real)}, x)
+end
+
+function Adapt.adapt_storage(::TrixiAdaptor{Storage, Real},
+                             x::AbstractArray) where {Storage, Real}
+    adapt(Storage, x)
+end
+
+# 3. TODO: Should we have a fallback? But that would imply implementing things for NamedTuple again
+
+function unsafe_wrap_or_alloc(::TrixiAdaptor{Storage}, vec, size) where {Storage}
+    return unsafe_wrap_or_alloc(Storage, vec, size)
+end
 end # @muladd
diff --git a/src/auxiliary/vector_of_arrays.jl b/src/auxiliary/vector_of_arrays.jl
new file mode 100644
index 00000000000..0fa8dd7f1ec
--- /dev/null
+++ b/src/auxiliary/vector_of_arrays.jl
@@ -0,0 +1,31 @@
+# By default, Julia/LLVM does not use fused multiply-add operations (FMAs).
+# Since these FMAs can increase the performance of many numerical algorithms,
+# we need to opt-in explicitly.
+# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details.
+@muladd begin
+#! format: noindent
+
+# Wraps a Vector of Arrays, forwarding `getindex` to the underlying Vector.
+# Implements `Adapt.adapt_structure` to allow offloading to the GPU, which is
+# not possible for a plain Vector of Arrays.
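+# The `Adapt.adapt_structure` method below adapts each wrapped array
+# individually, so the outer `Vector` stays on the host while its entries may
+# live in device memory.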
+struct VecOfArrays{T <: AbstractArray} + arrays::Vector{T} +end +Base.getindex(v::VecOfArrays, i::Int) = Base.getindex(v.arrays, i) +Base.IndexStyle(v::VecOfArrays) = Base.IndexStyle(v.arrays) +Base.size(v::VecOfArrays) = Base.size(v.arrays) +Base.length(v::VecOfArrays) = Base.length(v.arrays) +Base.eltype(v::VecOfArrays{T}) where {T} = T +function Adapt.adapt_structure(to, v::VecOfArrays) + return VecOfArrays([Adapt.adapt(to, arr) for arr in v.arrays]) +end +function Adapt.parent_type(::Type{<:VecOfArrays{T}}) where {T} + return T +end +function Adapt.unwrap_type(A::Type{<:VecOfArrays}) + Adapt.unwrap_type(Adapt.parent_type(A)) +end +function Base.convert(::Type{<:VecOfArrays}, v::Vector{<:AbstractArray}) + VecOfArrays(v) +end +end # @muladd diff --git a/src/semidiscretization/semidiscretization_hyperbolic.jl b/src/semidiscretization/semidiscretization_hyperbolic.jl index c909196b5db..f86be5dc069 100644 --- a/src/semidiscretization/semidiscretization_hyperbolic.jl +++ b/src/semidiscretization/semidiscretization_hyperbolic.jl @@ -27,25 +27,6 @@ mutable struct SemidiscretizationHyperbolic{Mesh, Equations, InitialCondition, solver::Solver cache::Cache performance_counter::PerformanceCounter - - function SemidiscretizationHyperbolic{Mesh, Equations, InitialCondition, - BoundaryConditions, SourceTerms, Solver, - Cache}(mesh::Mesh, equations::Equations, - initial_condition::InitialCondition, - boundary_conditions::BoundaryConditions, - source_terms::SourceTerms, - solver::Solver, - cache::Cache) where {Mesh, Equations, - InitialCondition, - BoundaryConditions, - SourceTerms, - Solver, - Cache} - performance_counter = PerformanceCounter() - - new(mesh, equations, initial_condition, boundary_conditions, source_terms, - solver, cache, performance_counter) - end end """ @@ -74,6 +55,8 @@ function SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver check_periodicity_mesh_boundary_conditions(mesh, _boundary_conditions) + performance_counter = PerformanceCounter() + SemidiscretizationHyperbolic{typeof(mesh), typeof(equations), typeof(initial_condition), typeof(_boundary_conditions), typeof(source_terms), @@ -81,9 +64,13 @@ function SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver initial_condition, _boundary_conditions, source_terms, solver, - cache) + cache, + performance_counter) end +# @eval due to @muladd +@eval Adapt.@adapt_structure(SemidiscretizationHyperbolic) + # Create a new semidiscretization but change some parameters compared to the input. # `Base.similar` follows a related concept but would require us to `copy` the `mesh`, # which would impact the performance. 
Instead, `SciMLBase.remake` has exactly the diff --git a/src/solvers/dg.jl b/src/solvers/dg.jl index 20b989da334..28774e0029a 100644 --- a/src/solvers/dg.jl +++ b/src/solvers/dg.jl @@ -400,6 +400,9 @@ struct DG{Basis, Mortar, SurfaceIntegral, VolumeIntegral} volume_integral::VolumeIntegral end +# @eval due to @muladd +@eval Adapt.@adapt_structure(DG) + function Base.show(io::IO, dg::DG) @nospecialize dg # reduce precompilation time diff --git a/src/solvers/dgsem/basis_lobatto_legendre.jl b/src/solvers/dgsem/basis_lobatto_legendre.jl index 777348aa8ce..9647f172e20 100644 --- a/src/solvers/dgsem/basis_lobatto_legendre.jl +++ b/src/solvers/dgsem/basis_lobatto_legendre.jl @@ -34,6 +34,32 @@ struct LobattoLegendreBasis{RealT <: Real, NNODES, # negative adjoint wrt the SBP dot product end +function Adapt.adapt_structure(to, basis::LobattoLegendreBasis) + inverse_vandermonde_legendre = adapt(to, basis.inverse_vandermonde_legendre) + RealT = eltype(inverse_vandermonde_legendre) + + nodes = SVector{<:Any, RealT}(basis.nodes) + weights = SVector{<:Any, RealT}(basis.weights) + inverse_weights = SVector{<:Any, RealT}(basis.inverse_weights) + boundary_interpolation = adapt(to, basis.boundary_interpolation) + derivative_matrix = adapt(to, basis.derivative_matrix) + derivative_split = adapt(to, basis.derivative_split) + derivative_split_transpose = adapt(to, basis.derivative_split_transpose) + derivative_dhat = adapt(to, basis.derivative_dhat) + return LobattoLegendreBasis{RealT, nnodes(basis), typeof(nodes), + typeof(inverse_vandermonde_legendre), + typeof(boundary_interpolation), + typeof(derivative_matrix)}(nodes, + weights, + inverse_weights, + inverse_vandermonde_legendre, + boundary_interpolation, + derivative_matrix, + derivative_split, + derivative_split_transpose, + derivative_dhat) +end + function LobattoLegendreBasis(RealT, polydeg::Integer) nnodes_ = polydeg + 1 @@ -155,6 +181,17 @@ struct LobattoLegendreMortarL2{RealT <: Real, NNODES, reverse_lower::ReverseMatrix end +function Adapt.adapt_structure(to, mortar::LobattoLegendreMortarL2) + forward_upper = adapt(to, mortar.forward_upper) + forward_lower = adapt(to, mortar.forward_lower) + reverse_upper = adapt(to, mortar.reverse_upper) + reverse_lower = adapt(to, mortar.reverse_lower) + return LobattoLegendreMortarL2{eltype(forward_upper), nnodes(mortar), + typeof(forward_upper), + typeof(reverse_upper)}(forward_upper, forward_lower, + reverse_upper, reverse_lower) +end + function MortarL2(basis::LobattoLegendreBasis) RealT = real(basis) nnodes_ = nnodes(basis) diff --git a/src/solvers/dgsem_p4est/containers.jl b/src/solvers/dgsem_p4est/containers.jl index a070db6b701..68e5b3d758b 100644 --- a/src/solvers/dgsem_p4est/containers.jl +++ b/src/solvers/dgsem_p4est/containers.jl @@ -6,25 +6,31 @@ #! 
format: noindent mutable struct P4estElementContainer{NDIMS, RealT <: Real, uEltype <: Real, NDIMSP1, - NDIMSP2, NDIMSP3} <: AbstractContainer + NDIMSP2, NDIMSP3, + ArrayNDIMSP1 <: DenseArray{RealT, NDIMSP1}, + ArrayNDIMSP2 <: DenseArray{RealT, NDIMSP2}, + ArrayNDIMSP3 <: DenseArray{RealT, NDIMSP3}, + VectorRealT <: DenseVector{RealT}, + VectoruEltype <: DenseVector{uEltype}} <: + AbstractContainer # Physical coordinates at each node - node_coordinates::Array{RealT, NDIMSP2} # [orientation, node_i, node_j, node_k, element] + node_coordinates::ArrayNDIMSP2 # [orientation, node_i, node_j, node_k, element] # Jacobian matrix of the transformation # [jacobian_i, jacobian_j, node_i, node_j, node_k, element] where jacobian_i is the first index of the Jacobian matrix,... - jacobian_matrix::Array{RealT, NDIMSP3} + jacobian_matrix::ArrayNDIMSP3 # Contravariant vectors, scaled by J, in Kopriva's blue book called Ja^i_n (i index, n dimension) - contravariant_vectors::Array{RealT, NDIMSP3} # [dimension, index, node_i, node_j, node_k, element] + contravariant_vectors::ArrayNDIMSP3 # [dimension, index, node_i, node_j, node_k, element] # 1/J where J is the Jacobian determinant (determinant of Jacobian matrix) - inverse_jacobian::Array{RealT, NDIMSP1} # [node_i, node_j, node_k, element] + inverse_jacobian::ArrayNDIMSP1 # [node_i, node_j, node_k, element] # Buffer for calculated surface flux - surface_flux_values::Array{uEltype, NDIMSP2} # [variable, i, j, direction, element] + surface_flux_values::ArrayNDIMSP2 # [variable, i, j, direction, element] # internal `resize!`able storage - _node_coordinates::Vector{RealT} - _jacobian_matrix::Vector{RealT} - _contravariant_vectors::Vector{RealT} - _inverse_jacobian::Vector{RealT} - _surface_flux_values::Vector{uEltype} + _node_coordinates::VectorRealT + _jacobian_matrix::VectorRealT + _contravariant_vectors::VectorRealT + _inverse_jacobian::VectorRealT + _surface_flux_values::VectoruEltype end @inline function nelements(elements::P4estElementContainer) @@ -36,7 +42,7 @@ end RealT, uEltype } - uEltype + return uEltype end # Only one-dimensional `Array`s are `resize!`able in Julia. 
@@ -51,28 +57,30 @@ function Base.resize!(elements::P4estElementContainer, capacity) n_dims = ndims(elements) n_nodes = size(elements.node_coordinates, 2) n_variables = size(elements.surface_flux_values, 1) + ArrayType = storage_type(elements) resize!(_node_coordinates, n_dims * n_nodes^n_dims * capacity) - elements.node_coordinates = unsafe_wrap(Array, pointer(_node_coordinates), + elements.node_coordinates = unsafe_wrap(ArrayType, pointer(_node_coordinates), (n_dims, ntuple(_ -> n_nodes, n_dims)..., capacity)) resize!(_jacobian_matrix, n_dims^2 * n_nodes^n_dims * capacity) - elements.jacobian_matrix = unsafe_wrap(Array, pointer(_jacobian_matrix), + elements.jacobian_matrix = unsafe_wrap(ArrayType, pointer(_jacobian_matrix), (n_dims, n_dims, ntuple(_ -> n_nodes, n_dims)..., capacity)) resize!(_contravariant_vectors, length(_jacobian_matrix)) - elements.contravariant_vectors = unsafe_wrap(Array, pointer(_contravariant_vectors), + elements.contravariant_vectors = unsafe_wrap(ArrayType, + pointer(_contravariant_vectors), size(elements.jacobian_matrix)) resize!(_inverse_jacobian, n_nodes^n_dims * capacity) - elements.inverse_jacobian = unsafe_wrap(Array, pointer(_inverse_jacobian), + elements.inverse_jacobian = unsafe_wrap(ArrayType, pointer(_inverse_jacobian), (ntuple(_ -> n_nodes, n_dims)..., capacity)) resize!(_surface_flux_values, n_variables * n_nodes^(n_dims - 1) * (n_dims * 2) * capacity) - elements.surface_flux_values = unsafe_wrap(Array, pointer(_surface_flux_values), + elements.surface_flux_values = unsafe_wrap(ArrayType, pointer(_surface_flux_values), (n_variables, ntuple(_ -> n_nodes, n_dims - 1)..., n_dims * 2, capacity)) @@ -117,33 +125,104 @@ function init_elements(mesh::Union{P4estMesh{NDIMS, NDIMS, RealT}, NDIMS * 2, nelements)) elements = P4estElementContainer{NDIMS, RealT, uEltype, NDIMS + 1, NDIMS + 2, - NDIMS + 3}(node_coordinates, jacobian_matrix, - contravariant_vectors, - inverse_jacobian, surface_flux_values, - _node_coordinates, _jacobian_matrix, - _contravariant_vectors, - _inverse_jacobian, _surface_flux_values) + NDIMS + 3, Array{RealT, NDIMS + 1}, + Array{RealT, NDIMS + 2}, Array{RealT, NDIMS + 3}, + Vector{RealT}, Vector{uEltype}}(node_coordinates, + jacobian_matrix, + contravariant_vectors, + inverse_jacobian, + surface_flux_values, + _node_coordinates, + _jacobian_matrix, + _contravariant_vectors, + _inverse_jacobian, + _surface_flux_values) init_elements!(elements, mesh, basis) return elements end -mutable struct P4estInterfaceContainer{NDIMS, uEltype <: Real, NDIMSP2} <: +function Adapt.parent_type(::Type{<:P4estElementContainer{<:Any, <:Any, <:Any, <:Any, + <:Any, <:Any, ArrayT}}) where {ArrayT} + ArrayT +end + +# Manual adapt_structure since we have aliasing memory +function Adapt.adapt_structure(to, + elements::P4estElementContainer{NDIMS}) where {NDIMS} + # Adapt underlying storage + _node_coordinates = adapt(to, elements._node_coordinates) + _jacobian_matrix = adapt(to, elements._jacobian_matrix) + _contravariant_vectors = adapt(to, elements._contravariant_vectors) + _inverse_jacobian = adapt(to, elements._inverse_jacobian) + _surface_flux_values = adapt(to, elements._surface_flux_values) + + RealT = eltype(_inverse_jacobian) + uEltype = eltype(_surface_flux_values) + + # Wrap arrays again + node_coordinates = unsafe_wrap_or_alloc(to, _node_coordinates, + size(elements.node_coordinates)) + jacobian_matrix = unsafe_wrap_or_alloc(to, _jacobian_matrix, + size(elements.jacobian_matrix)) + contravariant_vectors = unsafe_wrap_or_alloc(to, 
_contravariant_vectors, + size(jacobian_matrix)) + inverse_jacobian = unsafe_wrap_or_alloc(to, _inverse_jacobian, + size(elements.inverse_jacobian)) + surface_flux_values = unsafe_wrap_or_alloc(to, _surface_flux_values, + size(elements.surface_flux_values)) + + new_type_params = (NDIMS, + RealT, + uEltype, + NDIMS + 1, + NDIMS + 2, + NDIMS + 3, + typeof(inverse_jacobian), # ArrayNDIMSP1 + typeof(node_coordinates), # ArrayNDIMSP2 + typeof(jacobian_matrix), # ArrayNDIMSP3 + typeof(_node_coordinates), # VectorRealT + typeof(_surface_flux_values)) # VectoruEltype + return P4estElementContainer{new_type_params...}(node_coordinates, + jacobian_matrix, + contravariant_vectors, + inverse_jacobian, + surface_flux_values, + _node_coordinates, + _jacobian_matrix, + _contravariant_vectors, + _inverse_jacobian, + _surface_flux_values) +end + +mutable struct P4estInterfaceContainer{NDIMS, uEltype <: Real, NDIMSP2, + uArray <: DenseArray{uEltype, NDIMSP2}, + IdsMatrix <: DenseMatrix{Int}, + IndicesMatrix <: + DenseMatrix{NTuple{NDIMS, Symbol}}, + uVector <: DenseVector{uEltype}, + IdsVector <: DenseVector{Int}, + IndicesVector <: + DenseVector{NTuple{NDIMS, Symbol}}} <: AbstractContainer - u::Array{uEltype, NDIMSP2} # [primary/secondary, variable, i, j, interface] - neighbor_ids::Matrix{Int} # [primary/secondary, interface] - node_indices::Matrix{NTuple{NDIMS, Symbol}} # [primary/secondary, interface] + u::uArray # [primary/secondary, variable, i, j, interface] + neighbor_ids::IdsMatrix # [primary/secondary, interface] + node_indices::IndicesMatrix # [primary/secondary, interface] # internal `resize!`able storage - _u::Vector{uEltype} - _neighbor_ids::Vector{Int} - _node_indices::Vector{NTuple{NDIMS, Symbol}} + _u::uVector + _neighbor_ids::IdsVector + _node_indices::IndicesVector end @inline function ninterfaces(interfaces::P4estInterfaceContainer) size(interfaces.neighbor_ids, 2) end @inline Base.ndims(::P4estInterfaceContainer{NDIMS}) where {NDIMS} = NDIMS +@inline function Base.eltype(::P4estInterfaceContainer{NDIMS, uEltype}) where {NDIMS, + uEltype} + uEltype +end # See explanation of Base.resize! 
for the element container function Base.resize!(interfaces::P4estInterfaceContainer, capacity) @@ -152,17 +231,20 @@ function Base.resize!(interfaces::P4estInterfaceContainer, capacity) n_dims = ndims(interfaces) n_nodes = size(interfaces.u, 3) n_variables = size(interfaces.u, 2) + ArrayType = storage_type(interfaces) resize!(_u, 2 * n_variables * n_nodes^(n_dims - 1) * capacity) - interfaces.u = unsafe_wrap(Array, pointer(_u), + interfaces.u = unsafe_wrap(ArrayType, pointer(_u), (2, n_variables, ntuple(_ -> n_nodes, n_dims - 1)..., capacity)) resize!(_neighbor_ids, 2 * capacity) - interfaces.neighbor_ids = unsafe_wrap(Array, pointer(_neighbor_ids), (2, capacity)) + interfaces.neighbor_ids = unsafe_wrap(ArrayType, pointer(_neighbor_ids), + (2, capacity)) resize!(_node_indices, 2 * capacity) - interfaces.node_indices = unsafe_wrap(Array, pointer(_node_indices), (2, capacity)) + interfaces.node_indices = unsafe_wrap(ArrayType, pointer(_node_indices), + (2, capacity)) return nothing end @@ -189,10 +271,15 @@ function init_interfaces(mesh::Union{P4estMesh, P4estMeshView, T8codeMesh}, equa _node_indices = Vector{NTuple{NDIMS, Symbol}}(undef, 2 * n_interfaces) node_indices = unsafe_wrap(Array, pointer(_node_indices), (2, n_interfaces)) - interfaces = P4estInterfaceContainer{NDIMS, uEltype, NDIMS + 2}(u, neighbor_ids, - node_indices, - _u, _neighbor_ids, - _node_indices) + interfaces = P4estInterfaceContainer{NDIMS, uEltype, NDIMS + 2, + typeof(u), typeof(neighbor_ids), + typeof(node_indices), typeof(_u), + typeof(_neighbor_ids), typeof(_node_indices)}(u, + neighbor_ids, + node_indices, + _u, + _neighbor_ids, + _node_indices) init_interfaces!(interfaces, mesh) @@ -205,21 +292,58 @@ function init_interfaces!(interfaces, mesh::Union{P4estMesh, P4estMeshView}) return interfaces end -mutable struct P4estBoundaryContainer{NDIMS, uEltype <: Real, NDIMSP1} <: +function Adapt.parent_type(::Type{<:P4estInterfaceContainer{<:Any, <:Any, <:Any, + ArrayT}}) where {ArrayT} + ArrayT +end + +# Manual adapt_structure since we have aliasing memory +function Adapt.adapt_structure(to, interfaces::P4estInterfaceContainer) + # Adapt underlying storage + _u = adapt(to, interfaces._u) + _neighbor_ids = adapt(to, interfaces._neighbor_ids) + _node_indices = adapt(to, interfaces._node_indices) + # Wrap arrays again + u = unsafe_wrap_or_alloc(to, _u, size(interfaces.u)) + neighbor_ids = unsafe_wrap_or_alloc(to, _neighbor_ids, + size(interfaces.neighbor_ids)) + node_indices = unsafe_wrap_or_alloc(to, _node_indices, + size(interfaces.node_indices)) + + NDIMS = ndims(interfaces) + new_type_params = (NDIMS, + eltype(_u), + NDIMS + 2, + typeof(u), typeof(neighbor_ids), typeof(node_indices), + typeof(_u), typeof(_neighbor_ids), typeof(_node_indices)) + return P4estInterfaceContainer{new_type_params...}(u, neighbor_ids, node_indices, + _u, _neighbor_ids, _node_indices) +end + +mutable struct P4estBoundaryContainer{NDIMS, uEltype <: Real, NDIMSP1, + uArray <: DenseArray{uEltype, NDIMSP1}, + IdsVector <: DenseVector{Int}, + IndicesVector <: + DenseVector{NTuple{NDIMS, Symbol}}, + uVector <: DenseVector{uEltype}} <: AbstractContainer - u::Array{uEltype, NDIMSP1} # [variables, i, j, boundary] - neighbor_ids::Vector{Int} # [boundary] - node_indices::Vector{NTuple{NDIMS, Symbol}} # [boundary] + u::uArray # [variables, i, j, boundary] + neighbor_ids::IdsVector # [boundary] + node_indices::IndicesVector # [boundary] name::Vector{Symbol} # [boundary] # internal `resize!`able storage - _u::Vector{uEltype} + _u::uVector end @inline function 
nboundaries(boundaries::P4estBoundaryContainer) length(boundaries.neighbor_ids) end @inline Base.ndims(::P4estBoundaryContainer{NDIMS}) where {NDIMS} = NDIMS +@inline function Base.eltype(::P4estBoundaryContainer{NDIMS, uEltype}) where {NDIMS, + uEltype} + uEltype +end # See explanation of Base.resize! for the element container function Base.resize!(boundaries::P4estBoundaryContainer, capacity) @@ -228,9 +352,10 @@ function Base.resize!(boundaries::P4estBoundaryContainer, capacity) n_dims = ndims(boundaries) n_nodes = size(boundaries.u, 2) n_variables = size(boundaries.u, 1) + ArrayType = storage_type(boundaries) resize!(_u, n_variables * n_nodes^(n_dims - 1) * capacity) - boundaries.u = unsafe_wrap(Array, pointer(_u), + boundaries.u = unsafe_wrap(ArrayType, pointer(_u), (n_variables, ntuple(_ -> n_nodes, n_dims - 1)..., capacity)) @@ -263,9 +388,11 @@ function init_boundaries(mesh::Union{P4estMesh, P4estMeshView, T8codeMesh}, equa node_indices = Vector{NTuple{NDIMS, Symbol}}(undef, n_boundaries) names = Vector{Symbol}(undef, n_boundaries) - boundaries = P4estBoundaryContainer{NDIMS, uEltype, NDIMS + 1}(u, neighbor_ids, - node_indices, names, - _u) + boundaries = P4estBoundaryContainer{NDIMS, uEltype, NDIMS + 1, typeof(u), + typeof(neighbor_ids), typeof(node_indices), + typeof(_u)}(u, neighbor_ids, + node_indices, names, + _u) if n_boundaries > 0 init_boundaries!(boundaries, mesh) @@ -312,6 +439,25 @@ function init_boundaries_iter_face_inner(info_pw, boundaries, boundary_id, mesh) return nothing end +function Adapt.parent_type(::Type{<:P4estBoundaryContainer{<:Any, <:Any, <:Any, ArrayT}}) where {ArrayT} + ArrayT +end + +# Manual adapt_structure since we have aliasing memory +function Adapt.adapt_structure(to, boundaries::P4estBoundaryContainer) + _u = adapt(to, boundaries._u) + u = unsafe_wrap_or_alloc(to, _u, size(boundaries.u)) + neighbor_ids = adapt(to, boundaries.neighbor_ids) + node_indices = adapt(to, boundaries.node_indices) + name = boundaries.name + + NDIMS = ndims(boundaries) + return P4estBoundaryContainer{NDIMS, eltype(_u), NDIMS + 1, typeof(u), + typeof(neighbor_ids), typeof(node_indices), + typeof(_u)}(u, neighbor_ids, node_indices, + name, _u) +end + # Container data structure (structure-of-arrays style) for DG L2 mortars # # The positions used in `neighbor_ids` are 1:3 (in 2D) or 1:5 (in 3D), where 1:2 (in 2D) @@ -337,20 +483,32 @@ end # │ └─────────────┴─────────────┘ └───────────────────────────┘ # │ # ⋅────> ξ -mutable struct P4estMortarContainer{NDIMS, uEltype <: Real, NDIMSP1, NDIMSP3} <: +mutable struct P4estMortarContainer{NDIMS, uEltype <: Real, NDIMSP1, NDIMSP3, + uArray <: DenseArray{uEltype, NDIMSP3}, + IdsMatrix <: DenseMatrix{Int}, + IndicesMatrix <: + DenseMatrix{NTuple{NDIMS, Symbol}}, + uVector <: DenseVector{uEltype}, + IdsVector <: DenseVector{Int}, + IndicesVector <: + DenseVector{NTuple{NDIMS, Symbol}}} <: AbstractContainer - u::Array{uEltype, NDIMSP3} # [small/large side, variable, position, i, j, mortar] - neighbor_ids::Matrix{Int} # [position, mortar] - node_indices::Matrix{NTuple{NDIMS, Symbol}} # [small/large, mortar] + u::uArray # [small/large side, variable, position, i, j, mortar] + neighbor_ids::IdsMatrix # [position, mortar] + node_indices::IndicesMatrix # [small/large, mortar] # internal `resize!`able storage - _u::Vector{uEltype} - _neighbor_ids::Vector{Int} - _node_indices::Vector{NTuple{NDIMS, Symbol}} + _u::uVector + _neighbor_ids::IdsVector + _node_indices::IndicesVector end @inline nmortars(mortars::P4estMortarContainer) = 
size(mortars.neighbor_ids, 2) @inline Base.ndims(::P4estMortarContainer{NDIMS}) where {NDIMS} = NDIMS +@inline function Base.eltype(::P4estMortarContainer{NDIMS, uEltype}) where {NDIMS, + uEltype} + uEltype +end # See explanation of Base.resize! for the element container function Base.resize!(mortars::P4estMortarContainer, capacity) @@ -359,18 +517,19 @@ function Base.resize!(mortars::P4estMortarContainer, capacity) n_dims = ndims(mortars) n_nodes = size(mortars.u, 4) n_variables = size(mortars.u, 2) + ArrayType = storage_type(mortars) resize!(_u, 2 * n_variables * 2^(n_dims - 1) * n_nodes^(n_dims - 1) * capacity) - mortars.u = unsafe_wrap(Array, pointer(_u), + mortars.u = unsafe_wrap(ArrayType, pointer(_u), (2, n_variables, 2^(n_dims - 1), ntuple(_ -> n_nodes, n_dims - 1)..., capacity)) resize!(_neighbor_ids, (2^(n_dims - 1) + 1) * capacity) - mortars.neighbor_ids = unsafe_wrap(Array, pointer(_neighbor_ids), + mortars.neighbor_ids = unsafe_wrap(ArrayType, pointer(_neighbor_ids), (2^(n_dims - 1) + 1, capacity)) resize!(_node_indices, 2 * capacity) - mortars.node_indices = unsafe_wrap(Array, pointer(_node_indices), (2, capacity)) + mortars.node_indices = unsafe_wrap(ArrayType, pointer(_node_indices), (2, capacity)) return nothing end @@ -398,12 +557,15 @@ function init_mortars(mesh::Union{P4estMesh, P4estMeshView, T8codeMesh}, equatio _node_indices = Vector{NTuple{NDIMS, Symbol}}(undef, 2 * n_mortars) node_indices = unsafe_wrap(Array, pointer(_node_indices), (2, n_mortars)) - mortars = P4estMortarContainer{NDIMS, uEltype, NDIMS + 1, NDIMS + 3}(u, - neighbor_ids, - node_indices, - _u, - _neighbor_ids, - _node_indices) + mortars = P4estMortarContainer{NDIMS, uEltype, NDIMS + 1, NDIMS + 3, typeof(u), + typeof(neighbor_ids), typeof(node_indices), + typeof(_u), typeof(_neighbor_ids), + typeof(_node_indices)}(u, + neighbor_ids, + node_indices, + _u, + _neighbor_ids, + _node_indices) if n_mortars > 0 init_mortars!(mortars, mesh) @@ -418,6 +580,34 @@ function init_mortars!(mortars, mesh::Union{P4estMesh, P4estMeshView}) return mortars end +function Adapt.parent_type(::Type{<:P4estMortarContainer{<:Any, <:Any, <:Any, <:Any, + ArrayT}}) where {ArrayT} + ArrayT +end + +# Manual adapt_structure since we have aliasing memory +function Adapt.adapt_structure(to, mortars::P4estMortarContainer) + # Adapt underlying storage + _u = adapt(to, mortars._u) + _neighbor_ids = adapt(to, mortars._neighbor_ids) + _node_indices = adapt(to, mortars._node_indices) + + # Wrap arrays again + u = unsafe_wrap_or_alloc(to, _u, size(mortars.u)) + neighbor_ids = unsafe_wrap_or_alloc(to, _neighbor_ids, size(mortars.neighbor_ids)) + node_indices = unsafe_wrap_or_alloc(to, _node_indices, size(mortars.node_indices)) + + NDIMS = ndims(mortars) + new_type_params = (NDIMS, + eltype(_u), + NDIMS + 1, + NDIMS + 3, + typeof(u), typeof(neighbor_ids), typeof(node_indices), + typeof(_u), typeof(_neighbor_ids), typeof(_node_indices)) + return P4estMortarContainer{new_type_params...}(u, neighbor_ids, node_indices, + _u, _neighbor_ids, _node_indices) +end + function reinitialize_containers!(mesh::P4estMesh, equations, dg::DGSEM, cache) # Re-initialize elements container @unpack elements = cache diff --git a/src/solvers/dgsem_p4est/containers_parallel.jl b/src/solvers/dgsem_p4est/containers_parallel.jl index 676b37efff3..cb9cd1ffc95 100644 --- a/src/solvers/dgsem_p4est/containers_parallel.jl +++ b/src/solvers/dgsem_p4est/containers_parallel.jl @@ -5,15 +5,19 @@ @muladd begin #! 
format: noindent -mutable struct P4estMPIInterfaceContainer{NDIMS, uEltype <: Real, NDIMSP2} <: +mutable struct P4estMPIInterfaceContainer{NDIMS, uEltype <: Real, NDIMSP2, + uArray <: DenseArray{uEltype, NDIMSP2}, + VecInt <: DenseVector{Int}, + IndicesVector <: + DenseVector{NTuple{NDIMS, Symbol}}, + uVector <: DenseVector{uEltype}} <: AbstractContainer - u::Array{uEltype, NDIMSP2} # [primary/secondary, variable, i, j, interface] - local_neighbor_ids::Vector{Int} # [interface] - node_indices::Vector{NTuple{NDIMS, Symbol}} # [interface] - local_sides::Vector{Int} # [interface] - + u::uArray # [primary/secondary, variable, i, j, interface] + local_neighbor_ids::VecInt # [interface] + node_indices::IndicesVector # [interface] + local_sides::VecInt # [interface] # internal `resize!`able storage - _u::Vector{uEltype} + _u::uVector end @inline function nmpiinterfaces(interfaces::P4estMPIInterfaceContainer) @@ -27,9 +31,10 @@ function Base.resize!(mpi_interfaces::P4estMPIInterfaceContainer, capacity) n_dims = ndims(mpi_interfaces) n_nodes = size(mpi_interfaces.u, 3) n_variables = size(mpi_interfaces.u, 2) + ArrayType = storage_type(mpi_interfaces) resize!(_u, 2 * n_variables * n_nodes^(n_dims - 1) * capacity) - mpi_interfaces.u = unsafe_wrap(Array, pointer(_u), + mpi_interfaces.u = unsafe_wrap(ArrayType, pointer(_u), (2, n_variables, ntuple(_ -> n_nodes, n_dims - 1)..., capacity)) @@ -64,11 +69,13 @@ function init_mpi_interfaces(mesh::Union{ParallelP4estMesh, ParallelT8codeMesh}, local_sides = Vector{Int}(undef, n_mpi_interfaces) - mpi_interfaces = P4estMPIInterfaceContainer{NDIMS, uEltype, NDIMS + 2}(u, - local_neighbor_ids, - node_indices, - local_sides, - _u) + mpi_interfaces = P4estMPIInterfaceContainer{NDIMS, uEltype, NDIMS + 2, + typeof(u), typeof(local_neighbor_ids), + typeof(node_indices), typeof(_u)}(u, + local_neighbor_ids, + node_indices, + local_sides, + _u) init_mpi_interfaces!(mpi_interfaces, mesh) @@ -81,6 +88,32 @@ function init_mpi_interfaces!(mpi_interfaces, mesh::ParallelP4estMesh) return mpi_interfaces end +function Adapt.parent_type(::Type{<:Trixi.P4estMPIInterfaceContainer{<:Any, <:Any, + <:Any, A}}) where {A} + return A +end + +# Manual adapt_structure since we have aliasing memory +function Adapt.adapt_structure(to, mpi_interfaces::P4estMPIInterfaceContainer) + # Adapt Vectors and underlying storage + _u = adapt(to, mpi_interfaces._u) + local_neighbor_ids = adapt(to, mpi_interfaces.local_neighbor_ids) + node_indices = adapt(to, mpi_interfaces.node_indices) + local_sides = adapt(to, mpi_interfaces.local_sides) + + # Wrap array again + u = unsafe_wrap_or_alloc(to, _u, size(mpi_interfaces.u)) + + NDIMS = ndims(mpi_interfaces) + return P4estMPIInterfaceContainer{NDIMS, eltype(u), + NDIMS + 2, + typeof(u), typeof(local_neighbor_ids), + typeof(node_indices), typeof(_u)}(u, + local_neighbor_ids, + node_indices, + local_sides, _u) +end + # Container data structure (structure-of-arrays style) for DG L2 mortars # # Similar to `P4estMortarContainer`. The field `neighbor_ids` has been split up into @@ -88,14 +121,17 @@ end # available elements belonging to a particular MPI mortar. Furthermore, `normal_directions` holds # the normal vectors on the surface of the small elements for each mortar. 
mutable struct P4estMPIMortarContainer{NDIMS, uEltype <: Real, RealT <: Real, NDIMSP1,
-                                       NDIMSP2, NDIMSP3} <: AbstractContainer
-    u::Array{uEltype, NDIMSP3} # [small/large side, variable, position, i, j, mortar]
-    local_neighbor_ids::Vector{Vector{Int}} # [mortar][ids]
-    local_neighbor_positions::Vector{Vector{Int}} # [mortar][positions]
-    node_indices::Matrix{NTuple{NDIMS, Symbol}} # [small/large, mortar]
-    normal_directions::Array{RealT, NDIMSP2} # [dimension, i, j, position, mortar]
+                                       NDIMSP2, NDIMSP3,
+                                       uArray <: DenseArray{uEltype, NDIMSP3},
+                                       uVector <: DenseVector{uEltype}} <:
+       AbstractContainer
+    u::uArray # [small/large side, variable, position, i, j, mortar]
+    local_neighbor_ids::Vector{Vector{Int}} # [mortar][ids]
+    local_neighbor_positions::Vector{Vector{Int}} # [mortar][positions]
+    node_indices::Matrix{NTuple{NDIMS, Symbol}} # [small/large, mortar]
+    normal_directions::Array{RealT, NDIMSP2} # [dimension, i, j, position, mortar]
     # internal `resize!`able storage
-    _u::Vector{uEltype}
+    _u::uVector
     _node_indices::Vector{NTuple{NDIMS, Symbol}}
     _normal_directions::Vector{RealT}
 end
@@ -164,11 +200,12 @@ function init_mpi_mortars(mesh::Union{ParallelP4estMesh, ParallelT8codeMesh}, eq
                                       2^(NDIMS - 1), n_mpi_mortars))
 
     mpi_mortars = P4estMPIMortarContainer{NDIMS, uEltype, RealT, NDIMS + 1, NDIMS + 2,
-                                          NDIMS + 3}(u, local_neighbor_ids,
-                                                     local_neighbor_positions,
-                                                     node_indices, normal_directions,
-                                                     _u, _node_indices,
-                                                     _normal_directions)
+                                          NDIMS + 3, typeof(u),
+                                          typeof(_u)}(u, local_neighbor_ids,
+                                                      local_neighbor_positions,
+                                                      node_indices, normal_directions,
+                                                      _u, _node_indices,
+                                                      _normal_directions)
 
     if n_mpi_mortars > 0
         init_mpi_mortars!(mpi_mortars, mesh, basis, elements)
@@ -184,6 +221,33 @@ function init_mpi_mortars!(mpi_mortars, mesh::ParallelP4estMesh, basis, elements
     return mpi_mortars
 end
 
+function Adapt.adapt_structure(to, mpi_mortars::P4estMPIMortarContainer)
+    # TODO: The Vector-of-Vectors data structure does not work on GPUs and
+    # must be redesigned. This skeleton implementation exists only for
+    # compatibility with the rest of the KA.jl solver code
+
+    _u = adapt(to, mpi_mortars._u)
+    _node_indices = mpi_mortars._node_indices
+    _normal_directions = mpi_mortars._normal_directions
+
+    u = unsafe_wrap_or_alloc(to, _u, size(mpi_mortars.u))
+    local_neighbor_ids = mpi_mortars.local_neighbor_ids
+    local_neighbor_positions = mpi_mortars.local_neighbor_positions
+    node_indices = mpi_mortars.node_indices
+    normal_directions = mpi_mortars.normal_directions
+
+    NDIMS = ndims(mpi_mortars)
+    return P4estMPIMortarContainer{NDIMS, eltype(_u),
+                                   eltype(_normal_directions),
+                                   NDIMS + 1, NDIMS + 2, NDIMS + 3,
+                                   typeof(u), typeof(_u)}(u, local_neighbor_ids,
+                                                          local_neighbor_positions,
+                                                          node_indices,
+                                                          normal_directions, _u,
+                                                          _node_indices,
+                                                          _normal_directions)
+end
+
 # Overload init! function for regular interfaces, regular mortars and boundaries since they must
 # call the appropriate init_surfaces! function for parallel p4est meshes
 function init_interfaces!(interfaces, mesh::ParallelP4estMesh)
diff --git a/src/solvers/dgsem_p4est/dg_parallel.jl b/src/solvers/dgsem_p4est/dg_parallel.jl
index 2cc201dd1f0..7acddf07b4b 100644
--- a/src/solvers/dgsem_p4est/dg_parallel.jl
+++ b/src/solvers/dgsem_p4est/dg_parallel.jl
@@ -5,12 +5,13 @@
 @muladd begin
 #! 
format: noindent -mutable struct P4estMPICache{uEltype} +mutable struct P4estMPICache{BufferType <: DenseVector, + VecInt <: DenseVector{<:Integer}} mpi_neighbor_ranks::Vector{Int} - mpi_neighbor_interfaces::Vector{Vector{Int}} - mpi_neighbor_mortars::Vector{Vector{Int}} - mpi_send_buffers::Vector{Vector{uEltype}} - mpi_recv_buffers::Vector{Vector{uEltype}} + mpi_neighbor_interfaces::VecOfArrays{VecInt} + mpi_neighbor_mortars::VecOfArrays{VecInt} + mpi_send_buffers::VecOfArrays{BufferType} + mpi_recv_buffers::VecOfArrays{BufferType} mpi_send_requests::Vector{MPI.Request} mpi_recv_requests::Vector{MPI.Request} n_elements_by_rank::OffsetArray{Int, 1, Array{Int, 1}} @@ -25,25 +26,29 @@ function P4estMPICache(uEltype) end mpi_neighbor_ranks = Vector{Int}(undef, 0) - mpi_neighbor_interfaces = Vector{Vector{Int}}(undef, 0) - mpi_neighbor_mortars = Vector{Vector{Int}}(undef, 0) - mpi_send_buffers = Vector{Vector{uEltype}}(undef, 0) - mpi_recv_buffers = Vector{Vector{uEltype}}(undef, 0) + mpi_neighbor_interfaces = Vector{Vector{Int}}(undef, 0) |> VecOfArrays + mpi_neighbor_mortars = Vector{Vector{Int}}(undef, 0) |> VecOfArrays + mpi_send_buffers = Vector{Vector{uEltype}}(undef, 0) |> VecOfArrays + mpi_recv_buffers = Vector{Vector{uEltype}}(undef, 0) |> VecOfArrays mpi_send_requests = Vector{MPI.Request}(undef, 0) mpi_recv_requests = Vector{MPI.Request}(undef, 0) n_elements_by_rank = OffsetArray(Vector{Int}(undef, 0), 0:-1) n_elements_global = 0 first_element_global_id = 0 - P4estMPICache{uEltype}(mpi_neighbor_ranks, mpi_neighbor_interfaces, - mpi_neighbor_mortars, - mpi_send_buffers, mpi_recv_buffers, - mpi_send_requests, mpi_recv_requests, - n_elements_by_rank, n_elements_global, - first_element_global_id) + P4estMPICache{Vector{uEltype}, Vector{Int}}(mpi_neighbor_ranks, + mpi_neighbor_interfaces, + mpi_neighbor_mortars, + mpi_send_buffers, mpi_recv_buffers, + mpi_send_requests, mpi_recv_requests, + n_elements_by_rank, n_elements_global, + first_element_global_id) end -@inline Base.eltype(::P4estMPICache{uEltype}) where {uEltype} = uEltype +@inline Base.eltype(::P4estMPICache{BufferType}) where {BufferType} = eltype(BufferType) + +# @eval due to @muladd +@eval Adapt.@adapt_structure(P4estMPICache) ## # Note that the code in `start_mpi_send`/`finish_mpi_receive!` is sensitive to inference on (at least) Julia 1.10. 
@@ -265,16 +270,16 @@ end function init_mpi_cache!(mpi_cache::P4estMPICache, mesh::ParallelP4estMesh, mpi_interfaces, mpi_mortars, nvars, n_nodes, uEltype) - mpi_neighbor_ranks, mpi_neighbor_interfaces, mpi_neighbor_mortars = init_mpi_neighbor_connectivity(mpi_interfaces, - mpi_mortars, - mesh) + mpi_neighbor_ranks, _mpi_neighbor_interfaces, _mpi_neighbor_mortars = init_mpi_neighbor_connectivity(mpi_interfaces, + mpi_mortars, + mesh) - mpi_send_buffers, mpi_recv_buffers, mpi_send_requests, mpi_recv_requests = init_mpi_data_structures(mpi_neighbor_interfaces, - mpi_neighbor_mortars, - ndims(mesh), - nvars, - n_nodes, - uEltype) + _mpi_send_buffers, _mpi_recv_buffers, mpi_send_requests, mpi_recv_requests = init_mpi_data_structures(_mpi_neighbor_interfaces, + _mpi_neighbor_mortars, + ndims(mesh), + nvars, + n_nodes, + uEltype) # Determine local and total number of elements n_elements_global = Int(mesh.p4est.global_num_quadrants[]) @@ -286,6 +291,11 @@ function init_mpi_cache!(mpi_cache::P4estMPICache, mesh::ParallelP4estMesh, first_element_global_id = Int(mesh.p4est.global_first_quadrant[mpi_rank() + 1]) + 1 @assert n_elements_global==sum(n_elements_by_rank) "error in total number of elements" + mpi_neighbor_interfaces = VecOfArrays(_mpi_neighbor_interfaces) + mpi_neighbor_mortars = VecOfArrays(_mpi_neighbor_mortars) + mpi_send_buffers = VecOfArrays(_mpi_send_buffers) + mpi_recv_buffers = VecOfArrays(_mpi_recv_buffers) + # TODO reuse existing structures @pack! mpi_cache = mpi_neighbor_ranks, mpi_neighbor_interfaces, mpi_neighbor_mortars, diff --git a/src/solvers/dgsem_unstructured/sort_boundary_conditions.jl b/src/solvers/dgsem_unstructured/sort_boundary_conditions.jl index 0cb3bd7f409..d6cf6e1ce6d 100644 --- a/src/solvers/dgsem_unstructured/sort_boundary_conditions.jl +++ b/src/solvers/dgsem_unstructured/sort_boundary_conditions.jl @@ -13,9 +13,10 @@ It stores a set of global indices for each boundary condition type and name to e during the call to `calc_boundary_flux!`. The original dictionary form of the boundary conditions set by the user in the elixir file is also stored for printing. """ -mutable struct UnstructuredSortedBoundaryTypes{N, BCs <: NTuple{N, Any}} +mutable struct UnstructuredSortedBoundaryTypes{N, BCs <: NTuple{N, Any}, + Vec <: AbstractVector{<:Integer}} boundary_condition_types::BCs # specific boundary condition type(s), e.g. 
BoundaryConditionDirichlet - boundary_indices::NTuple{N, Vector{Int}} # integer vectors containing global boundary indices + boundary_indices::NTuple{N, Vec} # integer vectors containing global boundary indices boundary_dictionary::Dict{Symbol, Any} # boundary conditions as set by the user in the elixir file boundary_symbol_indices::Dict{Symbol, Vector{Int}} # integer vectors containing global boundary indices per boundary identifier end @@ -33,10 +34,11 @@ function UnstructuredSortedBoundaryTypes(boundary_conditions::Dict, cache) boundary_symbol_indices = Dict{Symbol, Vector{Int}}() container = UnstructuredSortedBoundaryTypes{n_boundary_types, - typeof(boundary_condition_types)}(boundary_condition_types, - boundary_indices, - boundary_conditions, - boundary_symbol_indices) + typeof(boundary_condition_types), + Vector{Int}}(boundary_condition_types, + boundary_indices, + boundary_conditions, + boundary_symbol_indices) initialize!(container, cache) end @@ -119,4 +121,7 @@ function initialize!(boundary_types_container::UnstructuredSortedBoundaryTypes{N return boundary_types_container end + +# @eval due to @muladd +@eval Adapt.@adapt_structure(UnstructuredSortedBoundaryTypes) end # @muladd diff --git a/test/Project.toml b/test/Project.toml index ec1a13a4bd1..c399dd967bf 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,5 +1,6 @@ [deps] ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" Convex = "f65535da-76fb-5f13-bab9-19810c17039a" diff --git a/test/test_p4est_2d.jl b/test/test_p4est_2d.jl index 4b1c7f5caca..b1472cb99cf 100644 --- a/test/test_p4est_2d.jl +++ b/test/test_p4est_2d.jl @@ -27,6 +27,12 @@ isdir(outdir) && rm(outdir, recursive = true) du_ode = similar(u_ode) @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000 end + semi32 = Trixi.trixi_adapt(Array, Float32, semi) + @test real(semi32.solver) == Float32 + @test real(semi32.solver.basis) == Float32 + @test real(semi32.solver.mortar) == Float32 + # TODO: remake ignores the mesh itself as well + @test real(semi32.mesh) == Float64 end @trixi_testset "elixir_advection_nonconforming_flag.jl" begin diff --git a/test/test_unstructured_2d.jl b/test/test_unstructured_2d.jl index 259eb39c545..c3291c3ba9d 100644 --- a/test/test_unstructured_2d.jl +++ b/test/test_unstructured_2d.jl @@ -2,6 +2,7 @@ module TestExamplesUnstructuredMesh2D using Test using Trixi +using Adapt include("test_trixi.jl") @@ -32,6 +33,12 @@ isdir(outdir) && rm(outdir, recursive = true) du_ode = similar(u_ode) @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000 end + semi32 = Trixi.trixi_adapt(Array, Float32, semi) + @test real(semi32.solver) == Float32 + @test real(semi32.solver.basis) == Float32 + @test real(semi32.solver.mortar) == Float32 + # TODO: remake ignores the mesh as well + @test real(semi32.mesh) == Float64 end @trixi_testset "elixir_euler_free_stream.jl" begin From fc610f9c7a0bcee83150ad984777c23d16665122 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 21 Apr 2025 18:37:41 +0200 Subject: [PATCH 02/81] add docs and CUDAExt --- Project.toml | 7 +++- docs/make.jl | 3 +- docs/src/heterogeneous.md | 82 +++++++++++++++++++++++++++++++++++++++ ext/TrixiCUDAExt.jl | 11 ++++++ 4 files changed, 100 insertions(+), 3 deletions(-) create mode 100644 docs/src/heterogeneous.md create mode 100644 ext/TrixiCUDAExt.jl diff --git a/Project.toml b/Project.toml index 204c4088f2f..5afb3d64225 100644 
--- a/Project.toml
+++ b/Project.toml
@@ -4,8 +4,8 @@ authors = ["Michael Schlottke-Lakemper ", "
 version = "0.11.16-DEV"
 
 [deps]
-Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697"
+Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 CodeTracking = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2"
 ConstructionBase = "187b0558-2788-49d3-abe0-74a17ed4e7c9"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
@@ -57,15 +57,18 @@ Convex = "f65535da-76fb-5f13-bab9-19810c17039a"
 ECOS = "e2685f51-7e38-5353-a97d-a921fd2c8199"
 Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"
 NLsolve = "2774e3e8-f4cf-5e23-947b-6d7e65073b56"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 
 [extensions]
 TrixiConvexECOSExt = ["Convex", "ECOS"]
 TrixiMakieExt = "Makie"
 TrixiNLsolveExt = "NLsolve"
+TrixiCUDAExt = "CUDA"
 
 [compat]
-Adapt = "4"
 Accessors = "0.1.36"
+Adapt = "4"
+CUDA = "5"
 CodeTracking = "1.0.5"
 ConstructionBase = "1.5"
 Convex = "0.16"
diff --git a/docs/make.jl b/docs/make.jl
index 60c11c5d2d1..a115294cc90 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -163,7 +163,8 @@ makedocs(
             "Style guide" => "styleguide.md",
             "Testing" => "testing.md",
             "Performance" => "performance.md",
-            "Parallelization" => "parallelization.md"
+            "Parallelization" => "parallelization.md",
+            "Heterogeneous" => "heterogeneous.md"
         ],
         "Troubleshooting and FAQ" => "troubleshooting.md",
         "Reference" => [
diff --git a/docs/src/heterogeneous.md b/docs/src/heterogeneous.md
new file mode 100644
index 00000000000..60bda029a40
--- /dev/null
+++ b/docs/src/heterogeneous.md
@@ -0,0 +1,82 @@
+# Heterogeneous computing
+
+Support for heterogeneous computing is currently being worked on.
+
+## The use of Adapt.jl
+
+[`Adapt.jl`](https://github.com/JuliaGPU/Adapt.jl) is a package in the JuliaGPU family that allows for
+the translation of nested data structures. The primary goal is to allow the substitution of `Array`
+at the storage leaves with a GPU array like `CuArray`.
+
+To facilitate this, data structures must be parameterized. Instead of:
+
+```julia
+struct Container
+    data::Array{Float64,2}
+end
+```
+
+they must be written as:
+
+```julia
+struct Container{D<:AbstractArray} <: Trixi.AbstractContainer
+    data::D
+end
+```
+
+Furthermore, we need to define a function that allows for the conversion of the storage
+of our types:
+
+```julia
+function Adapt.adapt_structure(to, C::Container)
+    return Container(adapt(to, C.data))
+end
+```
+
+or simply
+
+```julia
+Adapt.@adapt_structure(Container)
+```
+
+Additionally, we must define `Adapt.parent_type`:
+
+```julia
+function Adapt.parent_type(::Type{<:Container{D}}) where D
+    return D
+end
+```
+
+```julia-repl
+julia> C = Container(zeros(3))
+Container{Vector{Float64}}([0.0, 0.0, 0.0])
+
+julia> Trixi.storage_type(C)
+Array
+
+julia> using CUDA
+
+julia> GPU_C = adapt(CuArray, C)
+Container{CuArray{Float64, 1, CUDA.DeviceMemory}}([0.0, 0.0, 0.0])
+
+julia> Trixi.storage_type(GPU_C)
+CuArray
+```
+
+## Element-type conversion with `Trixi.trixi_adapt`
+
+We can use `Trixi.trixi_adapt` to perform both an element-type and a storage-type
+adaptation:
+
+```julia-repl
+julia> C = Container(zeros(3))
+Container{Vector{Float64}}([0.0, 0.0, 0.0])
+
+julia> Trixi.trixi_adapt(Array, Float32, C)
+Container{Vector{Float32}}(Float32[0.0, 0.0, 0.0])
+
+julia> Trixi.trixi_adapt(CuArray, Float32, C)
+Container{CuArray{Float32, 1, CUDA.DeviceMemory}}(Float32[0.0, 0.0, 0.0])
+```
+
+!!! note
+    `adapt(Array{Float32}, C)` is tempting, but it will do the wrong thing in the presence of `StaticArrays`.
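+    With the `Container` type from above holding, e.g., `SVector{2, Float64}`
+    entries, `adapt(Array{Float32}, C)` would try to `convert` each entry to a
+    single `Float32` and fail, whereas Trixi's adaptor recomputes the element
+    type via `StaticArrays.similar_type`. A sketch:
+
+    ```julia-repl
+    julia> using StaticArrays
+
+    julia> C = Container([SVector(1.0, 2.0)]);
+
+    julia> Trixi.trixi_adapt(Array, Float32, C)
+    Container{Vector{SVector{2, Float32}}}(SVector{2, Float32}[[1.0, 2.0]])
+    ```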
\ No newline at end of file diff --git a/ext/TrixiCUDAExt.jl b/ext/TrixiCUDAExt.jl new file mode 100644 index 00000000000..681d2f53a1e --- /dev/null +++ b/ext/TrixiCUDAExt.jl @@ -0,0 +1,11 @@ +# Package extension for adding CUDA-based features to Trixi.jl +module TrixiCUDAExt + +import CUDA: CuArray +import Trixi + +function Trixi.storage_type(::Type{<:CuArray}) + return CuArray +end + +end From 7b5d81b1c09653bb50c4c214f2acbde9dfe9140a Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 21 Apr 2025 21:35:04 +0200 Subject: [PATCH 03/81] Aqua set unbound_args --- test/test_aqua.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_aqua.jl b/test/test_aqua.jl index 9b3f2d67903..154088995ca 100644 --- a/test/test_aqua.jl +++ b/test/test_aqua.jl @@ -10,6 +10,7 @@ include("test_trixi.jl") @timed_testset "Aqua.jl" begin Aqua.test_all(Trixi, ambiguities = false, + unbound_args = false, # FIXME: UnstructuredSortedBoundaryTypes # exceptions necessary for adding a new method `StartUpDG.estimate_h` # in src/solvers/dgmulti/sbp.jl piracies = (treat_as_own = [Trixi.StartUpDG.RefElemData, From f730ef410e5b9450ae5f18821731799f3b1725d5 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 22 Apr 2025 09:26:23 +0200 Subject: [PATCH 04/81] lower bound CUDA to 5.2 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 5afb3d64225..3ce2daf16f9 100644 --- a/Project.toml +++ b/Project.toml @@ -68,7 +68,7 @@ TrixiCUDAExt = "CUDA" [compat] Accessors = "0.1.36" Adapt = "4" -CUDA = "5" +CUDA = "5.2" CodeTracking = "1.0.5" ConstructionBase = "1.5" Convex = "0.16" From 13b7f590b2604f53b92a681a51fe21582fc5c8eb Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 21 Apr 2025 17:16:18 +0200 Subject: [PATCH 05/81] add initial CUDA pipeline --- .buildkite/pipeline.yml | 9 ++++++--- test/Project.toml | 1 + test/runtests.jl | 9 +++++++++ test/test_cuda.jl | 20 ++++++++++++++++++++ 4 files changed, 36 insertions(+), 3 deletions(-) create mode 100644 test/test_cuda.jl diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 0f8ad475db8..344b8eacc3a 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,3 +1,5 @@ +env: + steps: - label: "CUDA Julia {{matrix.version}}" matrix: @@ -7,12 +9,13 @@ steps: plugins: - JuliaCI/julia#v1: version: "{{matrix.version}}" - command: | - true + - JuliaCI/julia-test#v1: ~ + env: + TRIXI_TEST: "CUDA" agents: queue: "juliagpu" cuda: "*" if: build.message !~ /\[skip ci\]/ timeout_in_minutes: 60 soft_fail: - - exit_status: 3 \ No newline at end of file + - exit_status: 3 diff --git a/test/Project.toml b/test/Project.toml index c399dd967bf..206654281d9 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -4,6 +4,7 @@ Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" Convex = "f65535da-76fb-5f13-bab9-19810c17039a" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" DoubleFloats = "497a8b3b-efae-58df-a0af-a86822472b78" Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" diff --git a/test/runtests.jl b/test/runtests.jl index a9dfc4cb999..d08ff018837 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -116,4 +116,13 @@ const TRIXI_NTHREADS = clamp(Sys.CPU_THREADS, 2, 3) @time if TRIXI_TEST == "all" || TRIXI_TEST == "paper_self_gravitating_gas_dynamics" include("test_paper_self_gravitating_gas_dynamics.jl") end + + @time if TRIXI_TEST 
== "all" || TRIXI_TEST == "CUDA" + import CUDA + if CUDA.functional() + include("test_cuda.jl") + else + @warn "Unable to run CUDA tests on this machine" + end + end end diff --git a/test/test_cuda.jl b/test/test_cuda.jl new file mode 100644 index 00000000000..f2fd11233c6 --- /dev/null +++ b/test/test_cuda.jl @@ -0,0 +1,20 @@ +module TestCUDA + +using CUDA +using Test +using Trixi + +include("test_trixi.jl") + +# EXAMPLES_DIR = joinpath(examples_dir(), "dgmulti_1d") + +# Start with a clean environment: remove Trixi.jl output directory if it exists +outdir = "out" +isdir(outdir) && rm(outdir, recursive = true) + +# TODO: + +# Clean up afterwards: delete Trixi.jl output directory +@test_nowarn isdir(outdir) && rm(outdir, recursive = true) + +end # module From 02de7d256adcdb4d2bd72cc7a98140f24648dacd Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 22 Apr 2025 10:08:37 +0200 Subject: [PATCH 06/81] add storage_type, real_type to semidiscretize --- .../p4est_2d_dgsem/elixir_advection_basic.jl | 2 +- src/semidiscretization/semidiscretization.jl | 21 ++++++++++++++++++- test/test_p4est_2d.jl | 21 +++++++++++++++++++ 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/examples/p4est_2d_dgsem/elixir_advection_basic.jl b/examples/p4est_2d_dgsem/elixir_advection_basic.jl index a87f1582121..33a049a3a1e 100644 --- a/examples/p4est_2d_dgsem/elixir_advection_basic.jl +++ b/examples/p4est_2d_dgsem/elixir_advection_basic.jl @@ -31,7 +31,7 @@ semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition_convergen # ODE solvers, callbacks etc. # Create ODE problem with time span from 0.0 to 1.0 -ode = semidiscretize(semi, (0.0, 1.0)) +ode = semidiscretize(semi, (0.0, 1.0); real_type = nothing, storage_type = nothing) # At the beginning of the main loop, the SummaryCallback prints a summary of the simulation setup # and resets the timers diff --git a/src/semidiscretization/semidiscretization.jl b/src/semidiscretization/semidiscretization.jl index f41c7ea4a7f..91599f4d63b 100644 --- a/src/semidiscretization/semidiscretization.jl +++ b/src/semidiscretization/semidiscretization.jl @@ -82,9 +82,15 @@ end Wrap the semidiscretization `semi` as an ODE problem in the time interval `tspan` that can be passed to `solve` from the [SciML ecosystem](https://diffeq.sciml.ai/latest/). + +The optional keyword arguments `storage_type` and `real_type` configure the underlying computational +datastructures. `storage_type` changes the fundamental array type being used, allowing the +experimental use of `CuArray` or other GPU array types. `real_type` changes the computational data type being used. """ function semidiscretize(semi::AbstractSemidiscretization, tspan; - reset_threads = true) + reset_threads = true, + storage_type = nothing, + real_type = nothing) # Optionally reset Polyester.jl threads. See # https://github.com/trixi-framework/Trixi.jl/issues/1583 # https://github.com/JuliaSIMD/Polyester.jl/issues/30 @@ -92,6 +98,19 @@ function semidiscretize(semi::AbstractSemidiscretization, tspan; Polyester.reset_threads!() end + if !(storage_type === nothing && real_type === nothing) + if storage_type === nothing + storage_type = Array + end + if real_type === nothing + real_type = Float64 + end + semi = trixi_adapt(storage_type, real_type, semi) + if eltype(tspan) !== real_type + tspan = convert.(real_type, tspan) + end + end + u0_ode = compute_coefficients(first(tspan), semi) # TODO: MPI, do we want to synchronize loading and print debug statements, e.g. 
using
 # mpi_isparallel() && MPI.Barrier(mpi_comm())
 
diff --git a/test/test_p4est_2d.jl b/test/test_p4est_2d.jl
index b1472cb99cf..f436faffaa1 100644
--- a/test/test_p4est_2d.jl
+++ b/test/test_p4est_2d.jl
@@ -35,6 +35,27 @@ isdir(outdir) && rm(outdir, recursive = true)
     @test real(semi32.mesh) == Float64
 end
 
+@trixi_testset "elixir_advection_basic.jl (Float32)" begin
+    @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic.jl"),
+                        # Expected errors are exactly the same as with TreeMesh!
+                        l2=[8.311947673061856e-6],
+                        linf=[6.627000273229378e-5],
+                        real_type=Float32)
+    # Ensure that we do not have excessive memory allocations
+    # (e.g., from type instabilities)
+    let
+        t = sol.t[end]
+        u_ode = sol.u[end]
+        du_ode = similar(u_ode)
+        @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000
+    end
+    @test real(ode.p.solver) == Float32
+    @test real(ode.p.solver.basis) == Float32
+    @test real(ode.p.solver.mortar) == Float32
+    # TODO: remake ignores the mesh itself as well
+    @test real(ode.p.mesh) == Float64
+end
+
 @trixi_testset "elixir_advection_nonconforming_flag.jl" begin
     @test_trixi_include(joinpath(EXAMPLES_DIR,
                                  "elixir_advection_nonconforming_flag.jl"),

From 671f5b16b065ba8bf2e832f2469d351083c17929 Mon Sep 17 00:00:00 2001
From: Valentin Churavy 
Date: Tue, 22 Apr 2025 10:25:33 +0200
Subject: [PATCH 07/81] add GPU construction test

---
 .../elixir_advection_basic_gpu.jl            | 60 +++++++++++++++++++
 test/test_cuda.jl                            | 24 +++++++-
 2 files changed, 83 insertions(+), 1 deletion(-)
 create mode 100644 examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl

diff --git a/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl
new file mode 100644
index 00000000000..4e26ec3df1a
--- /dev/null
+++ b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl
@@ -0,0 +1,60 @@
+# The same setup as tree_2d_dgsem/elixir_advection_basic.jl
+# to verify the P4estMesh implementation against TreeMesh
+
+using OrdinaryDiffEqSSPRK, OrdinaryDiffEqLowStorageRK
+using Trixi
+
+###############################################################################
+# semidiscretization of the linear advection equation
+
+advection_velocity = (0.2, -0.7)
+equations = LinearScalarAdvectionEquation2D(advection_velocity)
+
+# Create DG solver with polynomial degree = 3 and (local) Lax-Friedrichs/Rusanov flux as surface flux
+solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs)
+
+coordinates_min = (-1.0, -1.0) # minimum coordinates (min(x), min(y))
+coordinates_max = (1.0, 1.0) # maximum coordinates (max(x), max(y))
+
+trees_per_dimension = (8, 8)
+
+# Create P4estMesh with 8 x 8 trees and 16 x 16 elements
+mesh = P4estMesh(trees_per_dimension, polydeg = 3,
+                 coordinates_min = coordinates_min, coordinates_max = coordinates_max,
+                 initial_refinement_level = 1)
+
+# A semidiscretization collects data structures and functions for the spatial discretization
+semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition_convergence_test,
+                                    solver)
+
+###############################################################################
+# ODE solvers, callbacks etc.
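+
+# `storage_type` and `real_type` below are forwarded to `Trixi.trixi_adapt`;
+# passing e.g. `storage_type = CuArray` would move the computation to the GPU,
+# while `nothing` keeps the default CPU setup.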
+ +# Create ODE problem with time span from 0.0 to 1.0 +ode = semidiscretize(semi, (0.0, 1.0); real_type = nothing, storage_type = nothing) + +# At the beginning of the main loop, the SummaryCallback prints a summary of the simulation setup +# and resets the timers +summary_callback = SummaryCallback() + +# The AnalysisCallback allows to analyse the solution in regular intervals and prints the results +analysis_callback = AnalysisCallback(semi, interval = 100) + +# The SaveSolutionCallback allows to save the solution to a file in regular intervals +save_solution = SaveSolutionCallback(interval = 100, + solution_variables = cons2prim) + +# The StepsizeCallback handles the re-calculation of the maximum Δt after each time step +stepsize_callback = StepsizeCallback(cfl = 1.6) + +# Create a CallbackSet to collect all callbacks such that they can be passed to the ODE solver +callbacks = CallbackSet(summary_callback, analysis_callback, save_solution, + stepsize_callback) + +############################################################################### +# run the simulation + +# # OrdinaryDiffEq's `solve` method evolves the solution in time and executes the passed callbacks +# sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false); +# dt = 1.0, # solve needs some value here but it will be overwritten by the stepsize_callback +# ode_default_options()..., callback = callbacks); diff --git a/test/test_cuda.jl b/test/test_cuda.jl index f2fd11233c6..68872266986 100644 --- a/test/test_cuda.jl +++ b/test/test_cuda.jl @@ -12,7 +12,29 @@ include("test_trixi.jl") outdir = "out" isdir(outdir) && rm(outdir, recursive = true) -# TODO: +EXAMPLES_DIR = joinpath(examples_dir(), "p4est_2d_dgsem") + +@trixi_testset "elixir_advection_basic.jl (Float32)" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic.jl"), + # Expected errors are exactly the same as with TreeMesh! 
+ l2=[8.311947673061856e-6], + linf=[6.627000273229378e-5], + real_type=Float32, + storage_type=CuArray) + # # Ensure that we do not have excessive memory allocations + # # (e.g., from type instabilities) + # let + # t = sol.t[end] + # u_ode = sol.u[end] + # du_ode = similar(u_ode) + # @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000 + # end + @test real(ode.p.solver) == Float32 + @test real(ode.p.solver.basis) == Float32 + @test real(ode.p.solver.mortar) == Float32 + # TODO: remake ignores the mesh itself as well + @test real(ode.p.mesh) == Float64 +end # Clean up afterwards: delete Trixi.jl output directory @test_nowarn isdir(outdir) && rm(outdir, recursive = true) From ecd09a59063135fb2bf981e86b3c5d21ed1fae26 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 22 Apr 2025 12:08:26 +0200 Subject: [PATCH 08/81] don't adapt Array{MArray} --- src/auxiliary/containers.jl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/auxiliary/containers.jl b/src/auxiliary/containers.jl index 5738467ec6b..edc42db382b 100644 --- a/src/auxiliary/containers.jl +++ b/src/auxiliary/containers.jl @@ -388,6 +388,13 @@ function Adapt.adapt_storage(::TrixiAdaptor{Storage, Real}, adapt(Storage{StaticArrays.similar_type(T, Real)}, x) end +# Our threaded cache contains MArray, it is unlikely that we would want to adapt those +function Adapt.adapt_storage(::TrixiAdaptor{Storage, Real}, + x::Array{T}) where {Storage, Real, + T <: StaticArrays.MArray} + adapt(Array{StaticArrays.similar_type(T, Real)}, x) +end + function Adapt.adapt_storage(::TrixiAdaptor{Storage, Real}, x::AbstractArray) where {Storage, Real} adapt(Storage, x) From 312009af58e70430a7f00cd751ed3acaaea8def5 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 22 Apr 2025 13:36:22 +0200 Subject: [PATCH 09/81] add some more cuda adapt tests --- test/test_cuda.jl | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/test/test_cuda.jl b/test/test_cuda.jl index 68872266986..7a218f236d3 100644 --- a/test/test_cuda.jl +++ b/test/test_cuda.jl @@ -19,7 +19,7 @@ EXAMPLES_DIR = joinpath(examples_dir(), "p4est_2d_dgsem") # Expected errors are exactly the same as with TreeMesh! 
                        l2=[8.311947673061856e-6],
                         linf=[6.627000273229378e-5],
-                        real_type=Float32,
+                        real_type=Float64,
                         storage_type=CuArray)
     # # Ensure that we do not have excessive memory allocations
     # # (e.g., from type instabilities)
     # let
     #     t = sol.t[end]
     #     u_ode = sol.u[end]
     #     du_ode = similar(u_ode)
     #     @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000
     # end
-    @test real(ode.p.solver) == Float32
-    @test real(ode.p.solver.basis) == Float32
-    @test real(ode.p.solver.mortar) == Float32
+    @test real(ode.p.solver) == Float64
+    @test real(ode.p.solver.basis) == Float64
+    @test real(ode.p.solver.mortar) == Float64
     # TODO: remake ignores the mesh itself as well
     @test real(ode.p.mesh) == Float64
+
+    @test_broken ode.u0 isa CuArray
+    @test ode.p.solver.basis.boundary_interpolation isa CuArray
+    @test ode.p.solver.basis.derivative_matrix isa CuArray
+
+    @test ode.p.solver.mortar.forward_upper isa CuArray
+
+    @test Trixi.storage_type(ode.p.cache.elements) === CuArray
+    @test Trixi.storage_type(ode.p.cache.interfaces) === CuArray
+    @test Trixi.storage_type(ode.p.cache.boundaries) === CuArray
+    @test Trixi.storage_type(ode.p.cache.mortars) === CuArray
 end
 
 # Clean up afterwards: delete Trixi.jl output directory
 @test_nowarn isdir(outdir) && rm(outdir, recursive = true)

From 690efd1de65cbb4a34448fef15c78786c2fc4c69 Mon Sep 17 00:00:00 2001
From: Valentin Churavy
Date: Mon, 28 Apr 2025 16:18:18 +0200
Subject: [PATCH 10/81] use sources for dev branch

---
 .buildkite/pipeline.yml | 2 +-
 test/Project.toml       | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 344b8eacc3a..fdb4a855961 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -5,7 +5,7 @@ steps:
     matrix:
       setup:
         version:
-          - "1.10"
+          - "1.11"
     plugins:
       - JuliaCI/julia#v1:
           version: "{{matrix.version}}"

diff --git a/test/Project.toml b/test/Project.toml
index 206654281d9..77e50547a4f 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -59,3 +59,6 @@ Random = "1"
 StableRNGs = "1.0.2"
 Test = "1"
 TrixiTest = "0.1"
+
+[sources]
+CUDA = {url = "https://github.com/JuliaGPU/CUDA.jl", rev = "vc/unsafe_wrap_symbols"}

From 15a898b773573a4742baa186468962a4b6d39c7c Mon Sep 17 00:00:00 2001
From: Valentin Churavy
Date: Thu, 8 May 2025 11:50:42 +0200
Subject: [PATCH 11/81] fixup!
use sources for dev branch --- test/Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Project.toml b/test/Project.toml index 77e50547a4f..71ad1ca24e2 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -61,4 +61,4 @@ Test = "1" TrixiTest = "0.1" [sources] -CUDA = {url = "https://github.com/JuliaGPU/CUDA.jl", rev = "vc/unsafe_wrap_symbols"} +CUDA = {url = "https://github.com/JuliaGPU/CUDA.jl", rev = "master"} From 45d344bdeb6661a04c1b8f5cd4a3e41ac844157f Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Wed, 14 May 2025 10:38:54 +0200 Subject: [PATCH 12/81] use released version of CUDA --- .github/workflows/GPUCompat.yml | 86 --------------------------------- Project.toml | 2 +- test/Project.toml | 3 -- 3 files changed, 1 insertion(+), 90 deletions(-) delete mode 100644 .github/workflows/GPUCompat.yml diff --git a/.github/workflows/GPUCompat.yml b/.github/workflows/GPUCompat.yml deleted file mode 100644 index 335e1c83c4c..00000000000 --- a/.github/workflows/GPUCompat.yml +++ /dev/null @@ -1,86 +0,0 @@ -name: GPU Package Compatibility - -on: - pull_request: - paths-ignore: - - 'AUTHORS.md' - - 'CITATION.bib' - - 'CONTRIBUTING.md' - - 'LICENSE.md' - - 'NEWS.md' - - 'README.md' - - '.zenodo.json' - - '.github/workflows/benchmark.yml' - - '.github/workflows/CompatHelper.yml' - - '.github/workflows/TagBot.yml' - - 'benchmark/**' - - 'docs/**' - - 'utils/**' - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - test: - if: "!contains(github.event.head_commit.message, 'skip ci')" - name: ${{ matrix.os }} - ${{ matrix.arch }} - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - include: - - version: '1.10' - os: ubuntu-latest - arch: x64 - - version: '1.10' - os: windows-latest - arch: x64 - # CUDA.jl only supports 64-bit Linux and Windows, see https://github.com/JuliaGPU/CUDA.jl?tab=readme-ov-file#requirements - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Set up Julia - uses: julia-actions/setup-julia@v2 - with: - version: ${{ matrix.version }} - arch: ${{ matrix.arch }} - - - name: Display version info - run: julia -e 'using InteractiveUtils; versioninfo(verbose=true)' - - - name: Cache Julia packages - uses: julia-actions/cache@v2 - - - name: Build project - uses: julia-actions/julia-buildpkg@v1 - - # Only CUDA.jl is needed for GPU compatibility test now - - name: Add CUDA.jl to environment - run: | - julia --project=. -e ' - using Pkg; - Pkg.activate(temp=true); - Pkg.develop(PackageSpec(path=pwd())); - Pkg.add("CUDA"); - Pkg.update()' - - # - name: Add Metal.jl to environment - # run: | - # julia --project=. -e ' - # using Pkg; - # Pkg.activate(temp=true); - # Pkg.develop(PackageSpec(path=pwd())); - # Pkg.add("Metal"); - # Pkg.update()' - - # - name: Add AMDGPU.jl to environment - # run: | - # julia --project=. 
-e ' - # using Pkg; - # Pkg.activate(temp=true); - # Pkg.develop(PackageSpec(path=pwd())); - # Pkg.add("AMDGPU"); - # Pkg.update()' diff --git a/Project.toml b/Project.toml index 3ce2daf16f9..f16e133231d 100644 --- a/Project.toml +++ b/Project.toml @@ -68,7 +68,7 @@ TrixiCUDAExt = "CUDA" [compat] Accessors = "0.1.36" Adapt = "4" -CUDA = "5.2" +CUDA = "5.8" CodeTracking = "1.0.5" ConstructionBase = "1.5" Convex = "0.16" diff --git a/test/Project.toml b/test/Project.toml index 71ad1ca24e2..206654281d9 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -59,6 +59,3 @@ Random = "1" StableRNGs = "1.0.2" Test = "1" TrixiTest = "0.1" - -[sources] -CUDA = {url = "https://github.com/JuliaGPU/CUDA.jl", rev = "master"} From 7e72effd09762722cb6a1dee9cfc9e7fa8114c77 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Wed, 14 May 2025 10:43:30 +0200 Subject: [PATCH 13/81] Update .buildkite/pipeline.yml --- .buildkite/pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index fdb4a855961..344b8eacc3a 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -5,7 +5,7 @@ steps: matrix: setup: version: - - "1.11" + - "1.10" plugins: - JuliaCI/julia#v1: version: "{{matrix.version}}" From 3450dddcdc19347412161d747e817cfef3124e78 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 17 Dec 2024 17:36:16 +0100 Subject: [PATCH 14/81] Use Adapt.jl to change storage and element type In order to eventually support GPU computation we need to use Adapt.jl to allow GPU backend packages to swap out host-array types like `CuArray` with device-side types like `CuDeviceArray`. Additionally this will allow us to change the element type of a simulation by using `adapt(Array{Float32}`. 
Co-authored-by: Lars Christmann Co-authored-by: Benedict Geihe --- Project.toml | 2 + src/Trixi.jl | 2 + src/auxiliary/containers.jl | 84 +++++ src/auxiliary/vector_of_arrays.jl | 31 ++ .../semidiscretization_hyperbolic.jl | 27 +- src/solvers/dg.jl | 3 + src/solvers/dgsem/basis_lobatto_legendre.jl | 37 +++ src/solvers/dgsem_p4est/containers.jl | 314 ++++++++++++++---- .../dgsem_p4est/containers_parallel.jl | 114 +++++-- src/solvers/dgsem_p4est/dg_parallel.jl | 60 ++-- .../sort_boundary_conditions.jl | 17 +- test/Project.toml | 1 + test/test_p4est_2d.jl | 6 + test/test_unstructured_2d.jl | 7 + 14 files changed, 567 insertions(+), 138 deletions(-) create mode 100644 src/auxiliary/vector_of_arrays.jl diff --git a/Project.toml b/Project.toml index 5af41465607..e10c47ff1be 100644 --- a/Project.toml +++ b/Project.toml @@ -4,6 +4,7 @@ authors = ["Michael Schlottke-Lakemper ", " version = "0.12.5-DEV" [deps] +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697" CodeTracking = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2" ConstructionBase = "187b0558-2788-49d3-abe0-74a17ed4e7c9" @@ -63,6 +64,7 @@ TrixiMakieExt = "Makie" TrixiNLsolveExt = "NLsolve" [compat] +Adapt = "4" Accessors = "0.1.36" CodeTracking = "1.0.5" ConstructionBase = "1.5" diff --git a/src/Trixi.jl b/src/Trixi.jl index a707437655e..a52dfd6d973 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -50,6 +50,7 @@ import SciMLBase: get_du, get_tmp_cache, u_modified!, using DelimitedFiles: readdlm using Downloads: Downloads +using Adapt: Adapt, adapt using CodeTracking: CodeTracking using ConstructionBase: ConstructionBase using DiffEqBase: DiffEqBase, get_tstops, get_tstops_array @@ -132,6 +133,7 @@ include("basic_types.jl") # Include all top-level source files include("auxiliary/auxiliary.jl") +include("auxiliary/vector_of_arrays.jl") include("auxiliary/mpi.jl") include("auxiliary/p4est.jl") include("auxiliary/t8code.jl") diff --git a/src/auxiliary/containers.jl b/src/auxiliary/containers.jl index 90650f6abcf..5738467ec6b 100644 --- a/src/auxiliary/containers.jl +++ b/src/auxiliary/containers.jl @@ -314,4 +314,88 @@ end function raw_copy!(c::AbstractContainer, from::Int, destination::Int) raw_copy!(c, c, from, from, destination) end + +# Trixi storage types must implement these two Adapt.jl methods +function Adapt.adapt_structure(to, c::AbstractContainer) + error("Interface: Must implement Adapt.adapt_structure(to, ::$(typeof(c)))") +end + +function Adapt.parent_type(C::Type{<:AbstractContainer}) + error("Interface: Must implement Adapt.parent_type(::Type{$C}") +end + +function Adapt.unwrap_type(C::Type{<:AbstractContainer}) + return Adapt.unwrap_type(Adapt.parent_type(C)) +end + +# TODO: Upstream to Adapt +function storage_type(x) + return storage_type(typeof(x)) +end + +function storage_type(T::Type) + error("Interface: Must implement storage_type(::Type{$T}") +end + +function storage_type(::Type{<:Array}) + Array +end + +function storage_type(C::Type{<:AbstractContainer}) + return storage_type(Adapt.unwrap_type(C)) +end + +# For some storage backends like CUDA.jl, empty arrays do seem to simply be +# null pointers which can cause `unsafe_wrap` to fail when calling +# Adapt.adapt (ArgumentError, see +# https://github.com/JuliaGPU/CUDA.jl/blob/v5.4.2/src/array.jl#L212-L229). +# To circumvent this, on length zero arrays this allocates +# a separate empty array instead of wrapping. 
+# However, since zero length arrays are not used in calculations, +# it should be okay if the underlying storage vectors and wrapped arrays +# are not the same as long as they are properly wrapped when `resize!`d etc. +function unsafe_wrap_or_alloc(to, vector, size) + if length(vector) == 0 + return similar(vector, size) + else + return unsafe_wrap(to, pointer(vector), size) + end +end + +struct TrixiAdaptor{Storage, Real} end + +function trixi_adapt(storage, real, x) + adapt(TrixiAdaptor{storage, real}(), x) +end + +# Custom rules +# 1. handling of StaticArrays +function Adapt.adapt_storage(::TrixiAdaptor{<:Any, Real}, + x::StaticArrays.StaticArray{S, T, N}) where {Real, S, T, N} + StaticArrays.similar_type(x, Real)(x) +end + +# 2. Handling of Arrays +function Adapt.adapt_storage(::TrixiAdaptor{Storage, Real}, + x::AbstractArray{T}) where {Storage, Real, + T <: AbstractFloat} + adapt(Storage{Real}, x) +end + +function Adapt.adapt_storage(::TrixiAdaptor{Storage, Real}, + x::AbstractArray{T}) where {Storage, Real, + T <: StaticArrays.StaticArray} + adapt(Storage{StaticArrays.similar_type(T, Real)}, x) +end + +function Adapt.adapt_storage(::TrixiAdaptor{Storage, Real}, + x::AbstractArray) where {Storage, Real} + adapt(Storage, x) +end + +# 3. TODO: Should we have a fallback? But that would imply implementing things for NamedTuple again + +function unsafe_wrap_or_alloc(::TrixiAdaptor{Storage}, vec, size) where {Storage} + return unsafe_wrap_or_alloc(Storage, vec, size) +end end # @muladd diff --git a/src/auxiliary/vector_of_arrays.jl b/src/auxiliary/vector_of_arrays.jl new file mode 100644 index 00000000000..0fa8dd7f1ec --- /dev/null +++ b/src/auxiliary/vector_of_arrays.jl @@ -0,0 +1,31 @@ +# By default, Julia/LLVM does not use fused multiply-add operations (FMAs). +# Since these FMAs can increase the performance of many numerical algorithms, +# we need to opt-in explicitly. +# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details. +@muladd begin +#! format: noindent + +# Wraps a Vector of Arrays, forwards `getindex` to the underlying Vector. +# Implements `Adapt.adapt_structure` to allow offloading to the GPU which is +# not possible for a plain Vector of Arrays. 
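+#
+# A minimal usage sketch (illustrative; `CuArray` assumes a functional CUDA.jl):
+#
+#   buffers = VecOfArrays([zeros(4), zeros(8)])
+#   gpu_buffers = adapt(CuArray, buffers)  # a VecOfArrays of CuArrays
+#
+# Each inner array is adapted individually; the outer Vector stays on the host.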
+struct VecOfArrays{T <: AbstractArray} + arrays::Vector{T} +end +Base.getindex(v::VecOfArrays, i::Int) = Base.getindex(v.arrays, i) +Base.IndexStyle(v::VecOfArrays) = Base.IndexStyle(v.arrays) +Base.size(v::VecOfArrays) = Base.size(v.arrays) +Base.length(v::VecOfArrays) = Base.length(v.arrays) +Base.eltype(v::VecOfArrays{T}) where {T} = T +function Adapt.adapt_structure(to, v::VecOfArrays) + return VecOfArrays([Adapt.adapt(to, arr) for arr in v.arrays]) +end +function Adapt.parent_type(::Type{<:VecOfArrays{T}}) where {T} + return T +end +function Adapt.unwrap_type(A::Type{<:VecOfArrays}) + Adapt.unwrap_type(Adapt.parent_type(A)) +end +function Base.convert(::Type{<:VecOfArrays}, v::Vector{<:AbstractArray}) + VecOfArrays(v) +end +end # @muladd diff --git a/src/semidiscretization/semidiscretization_hyperbolic.jl b/src/semidiscretization/semidiscretization_hyperbolic.jl index 7496a345661..2a563c02229 100644 --- a/src/semidiscretization/semidiscretization_hyperbolic.jl +++ b/src/semidiscretization/semidiscretization_hyperbolic.jl @@ -27,25 +27,6 @@ mutable struct SemidiscretizationHyperbolic{Mesh, Equations, InitialCondition, solver::Solver cache::Cache performance_counter::PerformanceCounter - - function SemidiscretizationHyperbolic{Mesh, Equations, InitialCondition, - BoundaryConditions, SourceTerms, Solver, - Cache}(mesh::Mesh, equations::Equations, - initial_condition::InitialCondition, - boundary_conditions::BoundaryConditions, - source_terms::SourceTerms, - solver::Solver, - cache::Cache) where {Mesh, Equations, - InitialCondition, - BoundaryConditions, - SourceTerms, - Solver, - Cache} - performance_counter = PerformanceCounter() - - new(mesh, equations, initial_condition, boundary_conditions, source_terms, - solver, cache, performance_counter) - end end """ @@ -71,6 +52,8 @@ function SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver check_periodicity_mesh_boundary_conditions(mesh, _boundary_conditions) + performance_counter = PerformanceCounter() + SemidiscretizationHyperbolic{typeof(mesh), typeof(equations), typeof(initial_condition), typeof(_boundary_conditions), typeof(source_terms), @@ -78,9 +61,13 @@ function SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver initial_condition, _boundary_conditions, source_terms, solver, - cache) + cache, + performance_counter) end +# @eval due to @muladd +@eval Adapt.@adapt_structure(SemidiscretizationHyperbolic) + # Create a new semidiscretization but change some parameters compared to the input. # `Base.similar` follows a related concept but would require us to `copy` the `mesh`, # which would impact the performance. 
Instead, `SciMLBase.remake` has exactly the diff --git a/src/solvers/dg.jl b/src/solvers/dg.jl index ad211b3c003..78f3901a346 100644 --- a/src/solvers/dg.jl +++ b/src/solvers/dg.jl @@ -415,6 +415,9 @@ struct DG{Basis, Mortar, SurfaceIntegral, VolumeIntegral} volume_integral::VolumeIntegral end +# @eval due to @muladd +@eval Adapt.@adapt_structure(DG) + function Base.show(io::IO, dg::DG) @nospecialize dg # reduce precompilation time diff --git a/src/solvers/dgsem/basis_lobatto_legendre.jl b/src/solvers/dgsem/basis_lobatto_legendre.jl index 777348aa8ce..9647f172e20 100644 --- a/src/solvers/dgsem/basis_lobatto_legendre.jl +++ b/src/solvers/dgsem/basis_lobatto_legendre.jl @@ -34,6 +34,32 @@ struct LobattoLegendreBasis{RealT <: Real, NNODES, # negative adjoint wrt the SBP dot product end +function Adapt.adapt_structure(to, basis::LobattoLegendreBasis) + inverse_vandermonde_legendre = adapt(to, basis.inverse_vandermonde_legendre) + RealT = eltype(inverse_vandermonde_legendre) + + nodes = SVector{<:Any, RealT}(basis.nodes) + weights = SVector{<:Any, RealT}(basis.weights) + inverse_weights = SVector{<:Any, RealT}(basis.inverse_weights) + boundary_interpolation = adapt(to, basis.boundary_interpolation) + derivative_matrix = adapt(to, basis.derivative_matrix) + derivative_split = adapt(to, basis.derivative_split) + derivative_split_transpose = adapt(to, basis.derivative_split_transpose) + derivative_dhat = adapt(to, basis.derivative_dhat) + return LobattoLegendreBasis{RealT, nnodes(basis), typeof(nodes), + typeof(inverse_vandermonde_legendre), + typeof(boundary_interpolation), + typeof(derivative_matrix)}(nodes, + weights, + inverse_weights, + inverse_vandermonde_legendre, + boundary_interpolation, + derivative_matrix, + derivative_split, + derivative_split_transpose, + derivative_dhat) +end + function LobattoLegendreBasis(RealT, polydeg::Integer) nnodes_ = polydeg + 1 @@ -155,6 +181,17 @@ struct LobattoLegendreMortarL2{RealT <: Real, NNODES, reverse_lower::ReverseMatrix end +function Adapt.adapt_structure(to, mortar::LobattoLegendreMortarL2) + forward_upper = adapt(to, mortar.forward_upper) + forward_lower = adapt(to, mortar.forward_lower) + reverse_upper = adapt(to, mortar.reverse_upper) + reverse_lower = adapt(to, mortar.reverse_lower) + return LobattoLegendreMortarL2{eltype(forward_upper), nnodes(mortar), + typeof(forward_upper), + typeof(reverse_upper)}(forward_upper, forward_lower, + reverse_upper, reverse_lower) +end + function MortarL2(basis::LobattoLegendreBasis) RealT = real(basis) nnodes_ = nnodes(basis) diff --git a/src/solvers/dgsem_p4est/containers.jl b/src/solvers/dgsem_p4est/containers.jl index a070db6b701..68e5b3d758b 100644 --- a/src/solvers/dgsem_p4est/containers.jl +++ b/src/solvers/dgsem_p4est/containers.jl @@ -6,25 +6,31 @@ #! 
format: noindent mutable struct P4estElementContainer{NDIMS, RealT <: Real, uEltype <: Real, NDIMSP1, - NDIMSP2, NDIMSP3} <: AbstractContainer + NDIMSP2, NDIMSP3, + ArrayNDIMSP1 <: DenseArray{RealT, NDIMSP1}, + ArrayNDIMSP2 <: DenseArray{RealT, NDIMSP2}, + ArrayNDIMSP3 <: DenseArray{RealT, NDIMSP3}, + VectorRealT <: DenseVector{RealT}, + VectoruEltype <: DenseVector{uEltype}} <: + AbstractContainer # Physical coordinates at each node - node_coordinates::Array{RealT, NDIMSP2} # [orientation, node_i, node_j, node_k, element] + node_coordinates::ArrayNDIMSP2 # [orientation, node_i, node_j, node_k, element] # Jacobian matrix of the transformation # [jacobian_i, jacobian_j, node_i, node_j, node_k, element] where jacobian_i is the first index of the Jacobian matrix,... - jacobian_matrix::Array{RealT, NDIMSP3} + jacobian_matrix::ArrayNDIMSP3 # Contravariant vectors, scaled by J, in Kopriva's blue book called Ja^i_n (i index, n dimension) - contravariant_vectors::Array{RealT, NDIMSP3} # [dimension, index, node_i, node_j, node_k, element] + contravariant_vectors::ArrayNDIMSP3 # [dimension, index, node_i, node_j, node_k, element] # 1/J where J is the Jacobian determinant (determinant of Jacobian matrix) - inverse_jacobian::Array{RealT, NDIMSP1} # [node_i, node_j, node_k, element] + inverse_jacobian::ArrayNDIMSP1 # [node_i, node_j, node_k, element] # Buffer for calculated surface flux - surface_flux_values::Array{uEltype, NDIMSP2} # [variable, i, j, direction, element] + surface_flux_values::ArrayNDIMSP2 # [variable, i, j, direction, element] # internal `resize!`able storage - _node_coordinates::Vector{RealT} - _jacobian_matrix::Vector{RealT} - _contravariant_vectors::Vector{RealT} - _inverse_jacobian::Vector{RealT} - _surface_flux_values::Vector{uEltype} + _node_coordinates::VectorRealT + _jacobian_matrix::VectorRealT + _contravariant_vectors::VectorRealT + _inverse_jacobian::VectorRealT + _surface_flux_values::VectoruEltype end @inline function nelements(elements::P4estElementContainer) @@ -36,7 +42,7 @@ end RealT, uEltype } - uEltype + return uEltype end # Only one-dimensional `Array`s are `resize!`able in Julia. 
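+# With a configurable storage type, the `resize!` methods below grow the
+# one-dimensional storage vectors and then re-wrap them using the container's
+# storage type, roughly (illustrative sketch; `new_size` stands for the
+# re-computed dimensions):
+#
+#   ArrayType = storage_type(elements)
+#   resize!(_node_coordinates, prod(new_size))
+#   elements.node_coordinates = unsafe_wrap(ArrayType, pointer(_node_coordinates), new_size)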
@@ -51,28 +57,30 @@ function Base.resize!(elements::P4estElementContainer, capacity) n_dims = ndims(elements) n_nodes = size(elements.node_coordinates, 2) n_variables = size(elements.surface_flux_values, 1) + ArrayType = storage_type(elements) resize!(_node_coordinates, n_dims * n_nodes^n_dims * capacity) - elements.node_coordinates = unsafe_wrap(Array, pointer(_node_coordinates), + elements.node_coordinates = unsafe_wrap(ArrayType, pointer(_node_coordinates), (n_dims, ntuple(_ -> n_nodes, n_dims)..., capacity)) resize!(_jacobian_matrix, n_dims^2 * n_nodes^n_dims * capacity) - elements.jacobian_matrix = unsafe_wrap(Array, pointer(_jacobian_matrix), + elements.jacobian_matrix = unsafe_wrap(ArrayType, pointer(_jacobian_matrix), (n_dims, n_dims, ntuple(_ -> n_nodes, n_dims)..., capacity)) resize!(_contravariant_vectors, length(_jacobian_matrix)) - elements.contravariant_vectors = unsafe_wrap(Array, pointer(_contravariant_vectors), + elements.contravariant_vectors = unsafe_wrap(ArrayType, + pointer(_contravariant_vectors), size(elements.jacobian_matrix)) resize!(_inverse_jacobian, n_nodes^n_dims * capacity) - elements.inverse_jacobian = unsafe_wrap(Array, pointer(_inverse_jacobian), + elements.inverse_jacobian = unsafe_wrap(ArrayType, pointer(_inverse_jacobian), (ntuple(_ -> n_nodes, n_dims)..., capacity)) resize!(_surface_flux_values, n_variables * n_nodes^(n_dims - 1) * (n_dims * 2) * capacity) - elements.surface_flux_values = unsafe_wrap(Array, pointer(_surface_flux_values), + elements.surface_flux_values = unsafe_wrap(ArrayType, pointer(_surface_flux_values), (n_variables, ntuple(_ -> n_nodes, n_dims - 1)..., n_dims * 2, capacity)) @@ -117,33 +125,104 @@ function init_elements(mesh::Union{P4estMesh{NDIMS, NDIMS, RealT}, NDIMS * 2, nelements)) elements = P4estElementContainer{NDIMS, RealT, uEltype, NDIMS + 1, NDIMS + 2, - NDIMS + 3}(node_coordinates, jacobian_matrix, - contravariant_vectors, - inverse_jacobian, surface_flux_values, - _node_coordinates, _jacobian_matrix, - _contravariant_vectors, - _inverse_jacobian, _surface_flux_values) + NDIMS + 3, Array{RealT, NDIMS + 1}, + Array{RealT, NDIMS + 2}, Array{RealT, NDIMS + 3}, + Vector{RealT}, Vector{uEltype}}(node_coordinates, + jacobian_matrix, + contravariant_vectors, + inverse_jacobian, + surface_flux_values, + _node_coordinates, + _jacobian_matrix, + _contravariant_vectors, + _inverse_jacobian, + _surface_flux_values) init_elements!(elements, mesh, basis) return elements end -mutable struct P4estInterfaceContainer{NDIMS, uEltype <: Real, NDIMSP2} <: +function Adapt.parent_type(::Type{<:P4estElementContainer{<:Any, <:Any, <:Any, <:Any, + <:Any, <:Any, ArrayT}}) where {ArrayT} + ArrayT +end + +# Manual adapt_structure since we have aliasing memory +function Adapt.adapt_structure(to, + elements::P4estElementContainer{NDIMS}) where {NDIMS} + # Adapt underlying storage + _node_coordinates = adapt(to, elements._node_coordinates) + _jacobian_matrix = adapt(to, elements._jacobian_matrix) + _contravariant_vectors = adapt(to, elements._contravariant_vectors) + _inverse_jacobian = adapt(to, elements._inverse_jacobian) + _surface_flux_values = adapt(to, elements._surface_flux_values) + + RealT = eltype(_inverse_jacobian) + uEltype = eltype(_surface_flux_values) + + # Wrap arrays again + node_coordinates = unsafe_wrap_or_alloc(to, _node_coordinates, + size(elements.node_coordinates)) + jacobian_matrix = unsafe_wrap_or_alloc(to, _jacobian_matrix, + size(elements.jacobian_matrix)) + contravariant_vectors = unsafe_wrap_or_alloc(to, 
_contravariant_vectors, + size(jacobian_matrix)) + inverse_jacobian = unsafe_wrap_or_alloc(to, _inverse_jacobian, + size(elements.inverse_jacobian)) + surface_flux_values = unsafe_wrap_or_alloc(to, _surface_flux_values, + size(elements.surface_flux_values)) + + new_type_params = (NDIMS, + RealT, + uEltype, + NDIMS + 1, + NDIMS + 2, + NDIMS + 3, + typeof(inverse_jacobian), # ArrayNDIMSP1 + typeof(node_coordinates), # ArrayNDIMSP2 + typeof(jacobian_matrix), # ArrayNDIMSP3 + typeof(_node_coordinates), # VectorRealT + typeof(_surface_flux_values)) # VectoruEltype + return P4estElementContainer{new_type_params...}(node_coordinates, + jacobian_matrix, + contravariant_vectors, + inverse_jacobian, + surface_flux_values, + _node_coordinates, + _jacobian_matrix, + _contravariant_vectors, + _inverse_jacobian, + _surface_flux_values) +end + +mutable struct P4estInterfaceContainer{NDIMS, uEltype <: Real, NDIMSP2, + uArray <: DenseArray{uEltype, NDIMSP2}, + IdsMatrix <: DenseMatrix{Int}, + IndicesMatrix <: + DenseMatrix{NTuple{NDIMS, Symbol}}, + uVector <: DenseVector{uEltype}, + IdsVector <: DenseVector{Int}, + IndicesVector <: + DenseVector{NTuple{NDIMS, Symbol}}} <: AbstractContainer - u::Array{uEltype, NDIMSP2} # [primary/secondary, variable, i, j, interface] - neighbor_ids::Matrix{Int} # [primary/secondary, interface] - node_indices::Matrix{NTuple{NDIMS, Symbol}} # [primary/secondary, interface] + u::uArray # [primary/secondary, variable, i, j, interface] + neighbor_ids::IdsMatrix # [primary/secondary, interface] + node_indices::IndicesMatrix # [primary/secondary, interface] # internal `resize!`able storage - _u::Vector{uEltype} - _neighbor_ids::Vector{Int} - _node_indices::Vector{NTuple{NDIMS, Symbol}} + _u::uVector + _neighbor_ids::IdsVector + _node_indices::IndicesVector end @inline function ninterfaces(interfaces::P4estInterfaceContainer) size(interfaces.neighbor_ids, 2) end @inline Base.ndims(::P4estInterfaceContainer{NDIMS}) where {NDIMS} = NDIMS +@inline function Base.eltype(::P4estInterfaceContainer{NDIMS, uEltype}) where {NDIMS, + uEltype} + uEltype +end # See explanation of Base.resize! 
for the element container function Base.resize!(interfaces::P4estInterfaceContainer, capacity) @@ -152,17 +231,20 @@ function Base.resize!(interfaces::P4estInterfaceContainer, capacity) n_dims = ndims(interfaces) n_nodes = size(interfaces.u, 3) n_variables = size(interfaces.u, 2) + ArrayType = storage_type(interfaces) resize!(_u, 2 * n_variables * n_nodes^(n_dims - 1) * capacity) - interfaces.u = unsafe_wrap(Array, pointer(_u), + interfaces.u = unsafe_wrap(ArrayType, pointer(_u), (2, n_variables, ntuple(_ -> n_nodes, n_dims - 1)..., capacity)) resize!(_neighbor_ids, 2 * capacity) - interfaces.neighbor_ids = unsafe_wrap(Array, pointer(_neighbor_ids), (2, capacity)) + interfaces.neighbor_ids = unsafe_wrap(ArrayType, pointer(_neighbor_ids), + (2, capacity)) resize!(_node_indices, 2 * capacity) - interfaces.node_indices = unsafe_wrap(Array, pointer(_node_indices), (2, capacity)) + interfaces.node_indices = unsafe_wrap(ArrayType, pointer(_node_indices), + (2, capacity)) return nothing end @@ -189,10 +271,15 @@ function init_interfaces(mesh::Union{P4estMesh, P4estMeshView, T8codeMesh}, equa _node_indices = Vector{NTuple{NDIMS, Symbol}}(undef, 2 * n_interfaces) node_indices = unsafe_wrap(Array, pointer(_node_indices), (2, n_interfaces)) - interfaces = P4estInterfaceContainer{NDIMS, uEltype, NDIMS + 2}(u, neighbor_ids, - node_indices, - _u, _neighbor_ids, - _node_indices) + interfaces = P4estInterfaceContainer{NDIMS, uEltype, NDIMS + 2, + typeof(u), typeof(neighbor_ids), + typeof(node_indices), typeof(_u), + typeof(_neighbor_ids), typeof(_node_indices)}(u, + neighbor_ids, + node_indices, + _u, + _neighbor_ids, + _node_indices) init_interfaces!(interfaces, mesh) @@ -205,21 +292,58 @@ function init_interfaces!(interfaces, mesh::Union{P4estMesh, P4estMeshView}) return interfaces end -mutable struct P4estBoundaryContainer{NDIMS, uEltype <: Real, NDIMSP1} <: +function Adapt.parent_type(::Type{<:P4estInterfaceContainer{<:Any, <:Any, <:Any, + ArrayT}}) where {ArrayT} + ArrayT +end + +# Manual adapt_structure since we have aliasing memory +function Adapt.adapt_structure(to, interfaces::P4estInterfaceContainer) + # Adapt underlying storage + _u = adapt(to, interfaces._u) + _neighbor_ids = adapt(to, interfaces._neighbor_ids) + _node_indices = adapt(to, interfaces._node_indices) + # Wrap arrays again + u = unsafe_wrap_or_alloc(to, _u, size(interfaces.u)) + neighbor_ids = unsafe_wrap_or_alloc(to, _neighbor_ids, + size(interfaces.neighbor_ids)) + node_indices = unsafe_wrap_or_alloc(to, _node_indices, + size(interfaces.node_indices)) + + NDIMS = ndims(interfaces) + new_type_params = (NDIMS, + eltype(_u), + NDIMS + 2, + typeof(u), typeof(neighbor_ids), typeof(node_indices), + typeof(_u), typeof(_neighbor_ids), typeof(_node_indices)) + return P4estInterfaceContainer{new_type_params...}(u, neighbor_ids, node_indices, + _u, _neighbor_ids, _node_indices) +end + +mutable struct P4estBoundaryContainer{NDIMS, uEltype <: Real, NDIMSP1, + uArray <: DenseArray{uEltype, NDIMSP1}, + IdsVector <: DenseVector{Int}, + IndicesVector <: + DenseVector{NTuple{NDIMS, Symbol}}, + uVector <: DenseVector{uEltype}} <: AbstractContainer - u::Array{uEltype, NDIMSP1} # [variables, i, j, boundary] - neighbor_ids::Vector{Int} # [boundary] - node_indices::Vector{NTuple{NDIMS, Symbol}} # [boundary] + u::uArray # [variables, i, j, boundary] + neighbor_ids::IdsVector # [boundary] + node_indices::IndicesVector # [boundary] name::Vector{Symbol} # [boundary] # internal `resize!`able storage - _u::Vector{uEltype} + _u::uVector end @inline function 
nboundaries(boundaries::P4estBoundaryContainer) length(boundaries.neighbor_ids) end @inline Base.ndims(::P4estBoundaryContainer{NDIMS}) where {NDIMS} = NDIMS +@inline function Base.eltype(::P4estBoundaryContainer{NDIMS, uEltype}) where {NDIMS, + uEltype} + uEltype +end # See explanation of Base.resize! for the element container function Base.resize!(boundaries::P4estBoundaryContainer, capacity) @@ -228,9 +352,10 @@ function Base.resize!(boundaries::P4estBoundaryContainer, capacity) n_dims = ndims(boundaries) n_nodes = size(boundaries.u, 2) n_variables = size(boundaries.u, 1) + ArrayType = storage_type(boundaries) resize!(_u, n_variables * n_nodes^(n_dims - 1) * capacity) - boundaries.u = unsafe_wrap(Array, pointer(_u), + boundaries.u = unsafe_wrap(ArrayType, pointer(_u), (n_variables, ntuple(_ -> n_nodes, n_dims - 1)..., capacity)) @@ -263,9 +388,11 @@ function init_boundaries(mesh::Union{P4estMesh, P4estMeshView, T8codeMesh}, equa node_indices = Vector{NTuple{NDIMS, Symbol}}(undef, n_boundaries) names = Vector{Symbol}(undef, n_boundaries) - boundaries = P4estBoundaryContainer{NDIMS, uEltype, NDIMS + 1}(u, neighbor_ids, - node_indices, names, - _u) + boundaries = P4estBoundaryContainer{NDIMS, uEltype, NDIMS + 1, typeof(u), + typeof(neighbor_ids), typeof(node_indices), + typeof(_u)}(u, neighbor_ids, + node_indices, names, + _u) if n_boundaries > 0 init_boundaries!(boundaries, mesh) @@ -312,6 +439,25 @@ function init_boundaries_iter_face_inner(info_pw, boundaries, boundary_id, mesh) return nothing end +function Adapt.parent_type(::Type{<:P4estBoundaryContainer{<:Any, <:Any, <:Any, ArrayT}}) where {ArrayT} + ArrayT +end + +# Manual adapt_structure since we have aliasing memory +function Adapt.adapt_structure(to, boundaries::P4estBoundaryContainer) + _u = adapt(to, boundaries._u) + u = unsafe_wrap_or_alloc(to, _u, size(boundaries.u)) + neighbor_ids = adapt(to, boundaries.neighbor_ids) + node_indices = adapt(to, boundaries.node_indices) + name = boundaries.name + + NDIMS = ndims(boundaries) + return P4estBoundaryContainer{NDIMS, eltype(_u), NDIMS + 1, typeof(u), + typeof(neighbor_ids), typeof(node_indices), + typeof(_u)}(u, neighbor_ids, node_indices, + name, _u) +end + # Container data structure (structure-of-arrays style) for DG L2 mortars # # The positions used in `neighbor_ids` are 1:3 (in 2D) or 1:5 (in 3D), where 1:2 (in 2D) @@ -337,20 +483,32 @@ end # │ └─────────────┴─────────────┘ └───────────────────────────┘ # │ # ⋅────> ξ -mutable struct P4estMortarContainer{NDIMS, uEltype <: Real, NDIMSP1, NDIMSP3} <: +mutable struct P4estMortarContainer{NDIMS, uEltype <: Real, NDIMSP1, NDIMSP3, + uArray <: DenseArray{uEltype, NDIMSP3}, + IdsMatrix <: DenseMatrix{Int}, + IndicesMatrix <: + DenseMatrix{NTuple{NDIMS, Symbol}}, + uVector <: DenseVector{uEltype}, + IdsVector <: DenseVector{Int}, + IndicesVector <: + DenseVector{NTuple{NDIMS, Symbol}}} <: AbstractContainer - u::Array{uEltype, NDIMSP3} # [small/large side, variable, position, i, j, mortar] - neighbor_ids::Matrix{Int} # [position, mortar] - node_indices::Matrix{NTuple{NDIMS, Symbol}} # [small/large, mortar] + u::uArray # [small/large side, variable, position, i, j, mortar] + neighbor_ids::IdsMatrix # [position, mortar] + node_indices::IndicesMatrix # [small/large, mortar] # internal `resize!`able storage - _u::Vector{uEltype} - _neighbor_ids::Vector{Int} - _node_indices::Vector{NTuple{NDIMS, Symbol}} + _u::uVector + _neighbor_ids::IdsVector + _node_indices::IndicesVector end @inline nmortars(mortars::P4estMortarContainer) = 
size(mortars.neighbor_ids, 2) @inline Base.ndims(::P4estMortarContainer{NDIMS}) where {NDIMS} = NDIMS +@inline function Base.eltype(::P4estMortarContainer{NDIMS, uEltype}) where {NDIMS, + uEltype} + uEltype +end # See explanation of Base.resize! for the element container function Base.resize!(mortars::P4estMortarContainer, capacity) @@ -359,18 +517,19 @@ function Base.resize!(mortars::P4estMortarContainer, capacity) n_dims = ndims(mortars) n_nodes = size(mortars.u, 4) n_variables = size(mortars.u, 2) + ArrayType = storage_type(mortars) resize!(_u, 2 * n_variables * 2^(n_dims - 1) * n_nodes^(n_dims - 1) * capacity) - mortars.u = unsafe_wrap(Array, pointer(_u), + mortars.u = unsafe_wrap(ArrayType, pointer(_u), (2, n_variables, 2^(n_dims - 1), ntuple(_ -> n_nodes, n_dims - 1)..., capacity)) resize!(_neighbor_ids, (2^(n_dims - 1) + 1) * capacity) - mortars.neighbor_ids = unsafe_wrap(Array, pointer(_neighbor_ids), + mortars.neighbor_ids = unsafe_wrap(ArrayType, pointer(_neighbor_ids), (2^(n_dims - 1) + 1, capacity)) resize!(_node_indices, 2 * capacity) - mortars.node_indices = unsafe_wrap(Array, pointer(_node_indices), (2, capacity)) + mortars.node_indices = unsafe_wrap(ArrayType, pointer(_node_indices), (2, capacity)) return nothing end @@ -398,12 +557,15 @@ function init_mortars(mesh::Union{P4estMesh, P4estMeshView, T8codeMesh}, equatio _node_indices = Vector{NTuple{NDIMS, Symbol}}(undef, 2 * n_mortars) node_indices = unsafe_wrap(Array, pointer(_node_indices), (2, n_mortars)) - mortars = P4estMortarContainer{NDIMS, uEltype, NDIMS + 1, NDIMS + 3}(u, - neighbor_ids, - node_indices, - _u, - _neighbor_ids, - _node_indices) + mortars = P4estMortarContainer{NDIMS, uEltype, NDIMS + 1, NDIMS + 3, typeof(u), + typeof(neighbor_ids), typeof(node_indices), + typeof(_u), typeof(_neighbor_ids), + typeof(_node_indices)}(u, + neighbor_ids, + node_indices, + _u, + _neighbor_ids, + _node_indices) if n_mortars > 0 init_mortars!(mortars, mesh) @@ -418,6 +580,34 @@ function init_mortars!(mortars, mesh::Union{P4estMesh, P4estMeshView}) return mortars end +function Adapt.parent_type(::Type{<:P4estMortarContainer{<:Any, <:Any, <:Any, <:Any, + ArrayT}}) where {ArrayT} + ArrayT +end + +# Manual adapt_structure since we have aliasing memory +function Adapt.adapt_structure(to, mortars::P4estMortarContainer) + # Adapt underlying storage + _u = adapt(to, mortars._u) + _neighbor_ids = adapt(to, mortars._neighbor_ids) + _node_indices = adapt(to, mortars._node_indices) + + # Wrap arrays again + u = unsafe_wrap_or_alloc(to, _u, size(mortars.u)) + neighbor_ids = unsafe_wrap_or_alloc(to, _neighbor_ids, size(mortars.neighbor_ids)) + node_indices = unsafe_wrap_or_alloc(to, _node_indices, size(mortars.node_indices)) + + NDIMS = ndims(mortars) + new_type_params = (NDIMS, + eltype(_u), + NDIMS + 1, + NDIMS + 3, + typeof(u), typeof(neighbor_ids), typeof(node_indices), + typeof(_u), typeof(_neighbor_ids), typeof(_node_indices)) + return P4estMortarContainer{new_type_params...}(u, neighbor_ids, node_indices, + _u, _neighbor_ids, _node_indices) +end + function reinitialize_containers!(mesh::P4estMesh, equations, dg::DGSEM, cache) # Re-initialize elements container @unpack elements = cache diff --git a/src/solvers/dgsem_p4est/containers_parallel.jl b/src/solvers/dgsem_p4est/containers_parallel.jl index 676b37efff3..cb9cd1ffc95 100644 --- a/src/solvers/dgsem_p4est/containers_parallel.jl +++ b/src/solvers/dgsem_p4est/containers_parallel.jl @@ -5,15 +5,19 @@ @muladd begin #! 
format: noindent -mutable struct P4estMPIInterfaceContainer{NDIMS, uEltype <: Real, NDIMSP2} <: +mutable struct P4estMPIInterfaceContainer{NDIMS, uEltype <: Real, NDIMSP2, + uArray <: DenseArray{uEltype, NDIMSP2}, + VecInt <: DenseVector{Int}, + IndicesVector <: + DenseVector{NTuple{NDIMS, Symbol}}, + uVector <: DenseVector{uEltype}} <: AbstractContainer - u::Array{uEltype, NDIMSP2} # [primary/secondary, variable, i, j, interface] - local_neighbor_ids::Vector{Int} # [interface] - node_indices::Vector{NTuple{NDIMS, Symbol}} # [interface] - local_sides::Vector{Int} # [interface] - + u::uArray # [primary/secondary, variable, i, j, interface] + local_neighbor_ids::VecInt # [interface] + node_indices::IndicesVector # [interface] + local_sides::VecInt # [interface] # internal `resize!`able storage - _u::Vector{uEltype} + _u::uVector end @inline function nmpiinterfaces(interfaces::P4estMPIInterfaceContainer) @@ -27,9 +31,10 @@ function Base.resize!(mpi_interfaces::P4estMPIInterfaceContainer, capacity) n_dims = ndims(mpi_interfaces) n_nodes = size(mpi_interfaces.u, 3) n_variables = size(mpi_interfaces.u, 2) + ArrayType = storage_type(mpi_interfaces) resize!(_u, 2 * n_variables * n_nodes^(n_dims - 1) * capacity) - mpi_interfaces.u = unsafe_wrap(Array, pointer(_u), + mpi_interfaces.u = unsafe_wrap(ArrayType, pointer(_u), (2, n_variables, ntuple(_ -> n_nodes, n_dims - 1)..., capacity)) @@ -64,11 +69,13 @@ function init_mpi_interfaces(mesh::Union{ParallelP4estMesh, ParallelT8codeMesh}, local_sides = Vector{Int}(undef, n_mpi_interfaces) - mpi_interfaces = P4estMPIInterfaceContainer{NDIMS, uEltype, NDIMS + 2}(u, - local_neighbor_ids, - node_indices, - local_sides, - _u) + mpi_interfaces = P4estMPIInterfaceContainer{NDIMS, uEltype, NDIMS + 2, + typeof(u), typeof(local_neighbor_ids), + typeof(node_indices), typeof(_u)}(u, + local_neighbor_ids, + node_indices, + local_sides, + _u) init_mpi_interfaces!(mpi_interfaces, mesh) @@ -81,6 +88,32 @@ function init_mpi_interfaces!(mpi_interfaces, mesh::ParallelP4estMesh) return mpi_interfaces end +function Adapt.parent_type(::Type{<:Trixi.P4estMPIInterfaceContainer{<:Any, <:Any, + <:Any, A}}) where {A} + return A +end + +# Manual adapt_structure since we have aliasing memory +function Adapt.adapt_structure(to, mpi_interfaces::P4estMPIInterfaceContainer) + # Adapt Vectors and underlying storage + _u = adapt(to, mpi_interfaces._u) + local_neighbor_ids = adapt(to, mpi_interfaces.local_neighbor_ids) + node_indices = adapt(to, mpi_interfaces.node_indices) + local_sides = adapt(to, mpi_interfaces.local_sides) + + # Wrap array again + u = unsafe_wrap_or_alloc(to, _u, size(mpi_interfaces.u)) + + NDIMS = ndims(mpi_interfaces) + return P4estMPIInterfaceContainer{NDIMS, eltype(u), + NDIMS + 2, + typeof(u), typeof(local_neighbor_ids), + typeof(node_indices), typeof(_u)}(u, + local_neighbor_ids, + node_indices, + local_sides, _u) +end + # Container data structure (structure-of-arrays style) for DG L2 mortars # # Similar to `P4estMortarContainer`. The field `neighbor_ids` has been split up into @@ -88,14 +121,17 @@ end # available elements belonging to a particular MPI mortar. Furthermore, `normal_directions` holds # the normal vectors on the surface of the small elements for each mortar. 
 mutable struct P4estMPIMortarContainer{NDIMS, uEltype <: Real, RealT <: Real, NDIMSP1,
-                                       NDIMSP2, NDIMSP3} <: AbstractContainer
-    u::Array{uEltype, NDIMSP3} # [small/large side, variable, position, i, j, mortar]
-    local_neighbor_ids::Vector{Vector{Int}} # [mortar][ids]
-    local_neighbor_positions::Vector{Vector{Int}} # [mortar][positions]
-    node_indices::Matrix{NTuple{NDIMS, Symbol}} # [small/large, mortar]
-    normal_directions::Array{RealT, NDIMSP2} # [dimension, i, j, position, mortar]
+                                       NDIMSP2, NDIMSP3,
+                                       uArray <: DenseArray{uEltype, NDIMSP3},
+                                       uVector <: DenseVector{uEltype}} <:
+       AbstractContainer
+    u::uArray # [small/large side, variable, position, i, j, mortar]
+    local_neighbor_ids::Vector{Vector{Int}} # [mortar][ids]
+    local_neighbor_positions::Vector{Vector{Int}} # [mortar][positions]
+    node_indices::Matrix{NTuple{NDIMS, Symbol}} # [small/large, mortar]
+    normal_directions::Array{RealT, NDIMSP2} # [dimension, i, j, position, mortar]
     # internal `resize!`able storage
-    _u::Vector{uEltype}
+    _u::uVector
     _node_indices::Vector{NTuple{NDIMS, Symbol}}
     _normal_directions::Vector{RealT}
 end
@@ -164,11 +200,12 @@ function init_mpi_mortars(mesh::Union{ParallelP4estMesh, ParallelT8codeMesh}, eq
                                       2^(NDIMS - 1), n_mpi_mortars))
 
     mpi_mortars = P4estMPIMortarContainer{NDIMS, uEltype, RealT, NDIMS + 1, NDIMS + 2,
-                                          NDIMS + 3}(u, local_neighbor_ids,
-                                                     local_neighbor_positions,
-                                                     node_indices, normal_directions,
-                                                     _u, _node_indices,
-                                                     _normal_directions)
+                                          NDIMS + 3, typeof(u),
+                                          typeof(_u)}(u, local_neighbor_ids,
+                                                      local_neighbor_positions,
+                                                      node_indices, normal_directions,
+                                                      _u, _node_indices,
+                                                      _normal_directions)
 
     if n_mpi_mortars > 0
         init_mpi_mortars!(mpi_mortars, mesh, basis, elements)
@@ -184,6 +221,33 @@ function init_mpi_mortars!(mpi_mortars, mesh::ParallelP4estMesh, basis, elements
     return mpi_mortars
 end
 
+function Adapt.adapt_structure(to, mpi_mortars::P4estMPIMortarContainer)
+    # TODO: The Vector-of-Vectors data structures do not work on GPUs and must
+    # be redesigned. This skeleton implementation exists only for compatibility
+    # with the rest of the KA.jl solver code.
+
+    _u = adapt(to, mpi_mortars._u)
+    _node_indices = mpi_mortars._node_indices
+    _normal_directions = mpi_mortars._normal_directions
+
+    u = unsafe_wrap_or_alloc(to, _u, size(mpi_mortars.u))
+    local_neighbor_ids = mpi_mortars.local_neighbor_ids
+    local_neighbor_positions = mpi_mortars.local_neighbor_positions
+    node_indices = mpi_mortars.node_indices
+    normal_directions = mpi_mortars.normal_directions
+
+    NDIMS = ndims(mpi_mortars)
+    return P4estMPIMortarContainer{NDIMS, eltype(_u),
+                                   eltype(_normal_directions),
+                                   NDIMS + 1, NDIMS + 2, NDIMS + 3,
+                                   typeof(u), typeof(_u)}(u, local_neighbor_ids,
+                                                          local_neighbor_positions,
+                                                          node_indices,
+                                                          normal_directions, _u,
+                                                          _node_indices,
+                                                          _normal_directions)
+end
+
 # Overload init! function for regular interfaces, regular mortars and boundaries since they must
 # call the appropriate init_surfaces! function for parallel p4est meshes
 function init_interfaces!(interfaces, mesh::ParallelP4estMesh)

diff --git a/src/solvers/dgsem_p4est/dg_parallel.jl b/src/solvers/dgsem_p4est/dg_parallel.jl
index 2cc201dd1f0..7acddf07b4b 100644
--- a/src/solvers/dgsem_p4est/dg_parallel.jl
+++ b/src/solvers/dgsem_p4est/dg_parallel.jl
@@ -5,12 +5,13 @@ @muladd begin
 #!
format: noindent -mutable struct P4estMPICache{uEltype} +mutable struct P4estMPICache{BufferType <: DenseVector, + VecInt <: DenseVector{<:Integer}} mpi_neighbor_ranks::Vector{Int} - mpi_neighbor_interfaces::Vector{Vector{Int}} - mpi_neighbor_mortars::Vector{Vector{Int}} - mpi_send_buffers::Vector{Vector{uEltype}} - mpi_recv_buffers::Vector{Vector{uEltype}} + mpi_neighbor_interfaces::VecOfArrays{VecInt} + mpi_neighbor_mortars::VecOfArrays{VecInt} + mpi_send_buffers::VecOfArrays{BufferType} + mpi_recv_buffers::VecOfArrays{BufferType} mpi_send_requests::Vector{MPI.Request} mpi_recv_requests::Vector{MPI.Request} n_elements_by_rank::OffsetArray{Int, 1, Array{Int, 1}} @@ -25,25 +26,29 @@ function P4estMPICache(uEltype) end mpi_neighbor_ranks = Vector{Int}(undef, 0) - mpi_neighbor_interfaces = Vector{Vector{Int}}(undef, 0) - mpi_neighbor_mortars = Vector{Vector{Int}}(undef, 0) - mpi_send_buffers = Vector{Vector{uEltype}}(undef, 0) - mpi_recv_buffers = Vector{Vector{uEltype}}(undef, 0) + mpi_neighbor_interfaces = Vector{Vector{Int}}(undef, 0) |> VecOfArrays + mpi_neighbor_mortars = Vector{Vector{Int}}(undef, 0) |> VecOfArrays + mpi_send_buffers = Vector{Vector{uEltype}}(undef, 0) |> VecOfArrays + mpi_recv_buffers = Vector{Vector{uEltype}}(undef, 0) |> VecOfArrays mpi_send_requests = Vector{MPI.Request}(undef, 0) mpi_recv_requests = Vector{MPI.Request}(undef, 0) n_elements_by_rank = OffsetArray(Vector{Int}(undef, 0), 0:-1) n_elements_global = 0 first_element_global_id = 0 - P4estMPICache{uEltype}(mpi_neighbor_ranks, mpi_neighbor_interfaces, - mpi_neighbor_mortars, - mpi_send_buffers, mpi_recv_buffers, - mpi_send_requests, mpi_recv_requests, - n_elements_by_rank, n_elements_global, - first_element_global_id) + P4estMPICache{Vector{uEltype}, Vector{Int}}(mpi_neighbor_ranks, + mpi_neighbor_interfaces, + mpi_neighbor_mortars, + mpi_send_buffers, mpi_recv_buffers, + mpi_send_requests, mpi_recv_requests, + n_elements_by_rank, n_elements_global, + first_element_global_id) end -@inline Base.eltype(::P4estMPICache{uEltype}) where {uEltype} = uEltype +@inline Base.eltype(::P4estMPICache{BufferType}) where {BufferType} = eltype(BufferType) + +# @eval due to @muladd +@eval Adapt.@adapt_structure(P4estMPICache) ## # Note that the code in `start_mpi_send`/`finish_mpi_receive!` is sensitive to inference on (at least) Julia 1.10. 
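+#
+# Since the send/receive buffers are now stored as `VecOfArrays`, the whole
+# cache can in principle be moved to a different storage type in one call,
+# e.g. (illustrative sketch; an actual GPU run would additionally require
+# CUDA-aware MPI):
+#
+#   gpu_cache = adapt(CuArray, mpi_cache)  # buffers become CuArrays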
@@ -265,16 +270,16 @@ end function init_mpi_cache!(mpi_cache::P4estMPICache, mesh::ParallelP4estMesh, mpi_interfaces, mpi_mortars, nvars, n_nodes, uEltype) - mpi_neighbor_ranks, mpi_neighbor_interfaces, mpi_neighbor_mortars = init_mpi_neighbor_connectivity(mpi_interfaces, - mpi_mortars, - mesh) + mpi_neighbor_ranks, _mpi_neighbor_interfaces, _mpi_neighbor_mortars = init_mpi_neighbor_connectivity(mpi_interfaces, + mpi_mortars, + mesh) - mpi_send_buffers, mpi_recv_buffers, mpi_send_requests, mpi_recv_requests = init_mpi_data_structures(mpi_neighbor_interfaces, - mpi_neighbor_mortars, - ndims(mesh), - nvars, - n_nodes, - uEltype) + _mpi_send_buffers, _mpi_recv_buffers, mpi_send_requests, mpi_recv_requests = init_mpi_data_structures(_mpi_neighbor_interfaces, + _mpi_neighbor_mortars, + ndims(mesh), + nvars, + n_nodes, + uEltype) # Determine local and total number of elements n_elements_global = Int(mesh.p4est.global_num_quadrants[]) @@ -286,6 +291,11 @@ function init_mpi_cache!(mpi_cache::P4estMPICache, mesh::ParallelP4estMesh, first_element_global_id = Int(mesh.p4est.global_first_quadrant[mpi_rank() + 1]) + 1 @assert n_elements_global==sum(n_elements_by_rank) "error in total number of elements" + mpi_neighbor_interfaces = VecOfArrays(_mpi_neighbor_interfaces) + mpi_neighbor_mortars = VecOfArrays(_mpi_neighbor_mortars) + mpi_send_buffers = VecOfArrays(_mpi_send_buffers) + mpi_recv_buffers = VecOfArrays(_mpi_recv_buffers) + # TODO reuse existing structures @pack! mpi_cache = mpi_neighbor_ranks, mpi_neighbor_interfaces, mpi_neighbor_mortars, diff --git a/src/solvers/dgsem_unstructured/sort_boundary_conditions.jl b/src/solvers/dgsem_unstructured/sort_boundary_conditions.jl index 0cb3bd7f409..d6cf6e1ce6d 100644 --- a/src/solvers/dgsem_unstructured/sort_boundary_conditions.jl +++ b/src/solvers/dgsem_unstructured/sort_boundary_conditions.jl @@ -13,9 +13,10 @@ It stores a set of global indices for each boundary condition type and name to e during the call to `calc_boundary_flux!`. The original dictionary form of the boundary conditions set by the user in the elixir file is also stored for printing. """ -mutable struct UnstructuredSortedBoundaryTypes{N, BCs <: NTuple{N, Any}} +mutable struct UnstructuredSortedBoundaryTypes{N, BCs <: NTuple{N, Any}, + Vec <: AbstractVector{<:Integer}} boundary_condition_types::BCs # specific boundary condition type(s), e.g. 
BoundaryConditionDirichlet - boundary_indices::NTuple{N, Vector{Int}} # integer vectors containing global boundary indices + boundary_indices::NTuple{N, Vec} # integer vectors containing global boundary indices boundary_dictionary::Dict{Symbol, Any} # boundary conditions as set by the user in the elixir file boundary_symbol_indices::Dict{Symbol, Vector{Int}} # integer vectors containing global boundary indices per boundary identifier end @@ -33,10 +34,11 @@ function UnstructuredSortedBoundaryTypes(boundary_conditions::Dict, cache) boundary_symbol_indices = Dict{Symbol, Vector{Int}}() container = UnstructuredSortedBoundaryTypes{n_boundary_types, - typeof(boundary_condition_types)}(boundary_condition_types, - boundary_indices, - boundary_conditions, - boundary_symbol_indices) + typeof(boundary_condition_types), + Vector{Int}}(boundary_condition_types, + boundary_indices, + boundary_conditions, + boundary_symbol_indices) initialize!(container, cache) end @@ -119,4 +121,7 @@ function initialize!(boundary_types_container::UnstructuredSortedBoundaryTypes{N return boundary_types_container end + +# @eval due to @muladd +@eval Adapt.@adapt_structure(UnstructuredSortedBoundaryTypes) end # @muladd diff --git a/test/Project.toml b/test/Project.toml index cd1c122a18a..94683d362f5 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,5 +1,6 @@ [deps] ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" Convex = "f65535da-76fb-5f13-bab9-19810c17039a" diff --git a/test/test_p4est_2d.jl b/test/test_p4est_2d.jl index 577344d1a4a..7425d243111 100644 --- a/test/test_p4est_2d.jl +++ b/test/test_p4est_2d.jl @@ -27,6 +27,12 @@ isdir(outdir) && rm(outdir, recursive = true) du_ode = similar(u_ode) @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000 end + semi32 = Trixi.trixi_adapt(Array, Float32, semi) + @test real(semi32.solver) == Float32 + @test real(semi32.solver.basis) == Float32 + @test real(semi32.solver.mortar) == Float32 + # TODO: remake ignores the mesh itself as well + @test real(semi32.mesh) == Float64 end @trixi_testset "elixir_advection_nonconforming_flag.jl" begin diff --git a/test/test_unstructured_2d.jl b/test/test_unstructured_2d.jl index 07a79f883d3..0d13ecaa821 100644 --- a/test/test_unstructured_2d.jl +++ b/test/test_unstructured_2d.jl @@ -2,6 +2,7 @@ module TestExamplesUnstructuredMesh2D using Test using Trixi +using Adapt include("test_trixi.jl") @@ -32,6 +33,12 @@ isdir(outdir) && rm(outdir, recursive = true) du_ode = similar(u_ode) @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000 end + semi32 = Trixi.trixi_adapt(Array, Float32, semi) + @test real(semi32.solver) == Float32 + @test real(semi32.solver.basis) == Float32 + @test real(semi32.solver.mortar) == Float32 + # TODO: remake ignores the mesh as well + @test real(semi32.mesh) == Float64 end @trixi_testset "elixir_euler_free_stream.jl" begin From cf2f5905a8ac55427c14666f742e8bc9001c31c0 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 21 Apr 2025 18:37:41 +0200 Subject: [PATCH 15/81] add docs and CUDAExt --- Project.toml | 7 +++- docs/make.jl | 3 +- docs/src/heterogeneous.md | 82 +++++++++++++++++++++++++++++++++++++++ ext/TrixiCUDAExt.jl | 11 ++++++ 4 files changed, 100 insertions(+), 3 deletions(-) create mode 100644 docs/src/heterogeneous.md create mode 100644 ext/TrixiCUDAExt.jl diff --git a/Project.toml b/Project.toml index e10c47ff1be..0c53ef69666 100644 
--- a/Project.toml
+++ b/Project.toml
@@ -4,8 +4,8 @@ authors = ["Michael Schlottke-Lakemper ", "
 version = "0.12.5-DEV"
 
 [deps]
-Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697"
+Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 CodeTracking = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2"
 ConstructionBase = "187b0558-2788-49d3-abe0-74a17ed4e7c9"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
@@ -57,15 +57,18 @@ Convex = "f65535da-76fb-5f13-bab9-19810c17039a"
 ECOS = "e2685f51-7e38-5353-a97d-a921fd2c8199"
 Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"
 NLsolve = "2774e3e8-f4cf-5e23-947b-6d7e65073b56"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 
 [extensions]
 TrixiConvexECOSExt = ["Convex", "ECOS"]
 TrixiMakieExt = "Makie"
 TrixiNLsolveExt = "NLsolve"
+TrixiCUDAExt = "CUDA"
 
 [compat]
-Adapt = "4"
 Accessors = "0.1.36"
+Adapt = "4"
+CUDA = "5"
 CodeTracking = "1.0.5"
 ConstructionBase = "1.5"
 Convex = "0.16"

diff --git a/docs/make.jl b/docs/make.jl
index 7111b66ab94..0301f5ba64e 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -163,7 +163,8 @@ makedocs(
             "Style guide" => "styleguide.md",
             "Testing" => "testing.md",
             "Performance" => "performance.md",
-            "Parallelization" => "parallelization.md"
+            "Parallelization" => "parallelization.md",
+            "Heterogeneous" => "heterogeneous.md"
         ],
         "Troubleshooting and FAQ" => "troubleshooting.md",
         "Reference" => [

diff --git a/docs/src/heterogeneous.md b/docs/src/heterogeneous.md
new file mode 100644
index 00000000000..60bda029a40
--- /dev/null
+++ b/docs/src/heterogeneous.md
@@ -0,0 +1,82 @@
+# Heterogeneous computing
+
+Support for heterogeneous computing is currently being worked on.
+
+## The use of Adapt.jl
+
+[`Adapt.jl`](https://github.com/JuliaGPU/Adapt.jl) is a package in the JuliaGPU family that allows for
+the translation of nested data structures. The primary goal is to allow the substitution of `Array`
+at the storage leaves with a GPU array like `CuArray`.
+
+To facilitate this, data structures must be parameterized. So instead of
+
+```julia
+struct Container
+    data::Array{Float64,2}
+end
+```
+
+they must be written as
+
+```julia
+struct Container{D<:AbstractArray} <: Trixi.AbstractContainer
+    data::D
+end
+```
+
+Furthermore, we need to define a function that allows for the conversion of the storage
+of our types:
+
+```julia
+function Adapt.adapt_structure(to, C::Container)
+    return Container(adapt(to, C.data))
+end
+```
+
+or simply
+
+```julia
+Adapt.@adapt_structure(Container)
+```
+
+Additionally, we must define `Adapt.parent_type`:
+
+```julia
+function Adapt.parent_type(::Type{<:Container{D}}) where D
+    return D
+end
+```
+
+```julia-repl
+julia> C = Container(zeros(3))
+Container{Vector{Float64}}([0.0, 0.0, 0.0])
+
+julia> Trixi.storage_type(C)
+Array
+
+julia> using CUDA
+
+julia> GPU_C = adapt(CuArray, C)
+Container{CuArray{Float64, 1, CUDA.DeviceMemory}}([0.0, 0.0, 0.0])
+
+julia> Trixi.storage_type(GPU_C)
+CuArray
+```
+
+## Element-type conversion with `Trixi.trixi_adapt`
+
+We can use `Trixi.trixi_adapt` to perform both an element-type and a storage-type adaptation:
+
+```julia-repl
+julia> C = Container(zeros(3))
+Container{Vector{Float64}}([0.0, 0.0, 0.0])
+
+julia> Trixi.trixi_adapt(Array, Float32, C)
+Container{Vector{Float32}}(Float32[0.0, 0.0, 0.0])
+
+julia> Trixi.trixi_adapt(CuArray, Float32, C)
+Container{CuArray{Float32, 1, CUDA.DeviceMemory}}(Float32[0.0, 0.0, 0.0])
+```
+
+!!! note
+    `adapt(Array{Float32}, C)` is tempting, but it does the wrong thing in the presence of `StaticArrays`.
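+
+For example, given the `Container` type from above but holding a vector of
+`SVector`s (an illustrative sketch, without the printed results):
+
+```julia-repl
+julia> using StaticArrays
+
+julia> S = Container([SVector(1.0, 2.0)])
+
+julia> adapt(Array{Float32}, S)  # wrong: tries to convert the `SVector`s themselves
+
+julia> Trixi.trixi_adapt(Array, Float32, S)  # keeps the `SVector`s, converting their element type
+```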
\ No newline at end of file diff --git a/ext/TrixiCUDAExt.jl b/ext/TrixiCUDAExt.jl new file mode 100644 index 00000000000..681d2f53a1e --- /dev/null +++ b/ext/TrixiCUDAExt.jl @@ -0,0 +1,11 @@ +# Package extension for adding CUDA-based features to Trixi.jl +module TrixiCUDAExt + +import CUDA: CuArray +import Trixi + +function Trixi.storage_type(::Type{<:CuArray}) + return CuArray +end + +end From de96f850444b875767a114921569be25df027d1e Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 21 Apr 2025 21:35:04 +0200 Subject: [PATCH 16/81] Aqua set unbound_args --- test/test_aqua.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_aqua.jl b/test/test_aqua.jl index 9b3f2d67903..154088995ca 100644 --- a/test/test_aqua.jl +++ b/test/test_aqua.jl @@ -10,6 +10,7 @@ include("test_trixi.jl") @timed_testset "Aqua.jl" begin Aqua.test_all(Trixi, ambiguities = false, + unbound_args = false, # FIXME: UnstructuredSortedBoundaryTypes # exceptions necessary for adding a new method `StartUpDG.estimate_h` # in src/solvers/dgmulti/sbp.jl piracies = (treat_as_own = [Trixi.StartUpDG.RefElemData, From 1a7cff2673a2111ba6e143c757276ededf1e69a7 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 22 Apr 2025 09:26:23 +0200 Subject: [PATCH 17/81] lower bound CUDA to 5.2 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 0c53ef69666..689f054adf0 100644 --- a/Project.toml +++ b/Project.toml @@ -68,7 +68,7 @@ TrixiCUDAExt = "CUDA" [compat] Accessors = "0.1.36" Adapt = "4" -CUDA = "5" +CUDA = "5.2" CodeTracking = "1.0.5" ConstructionBase = "1.5" Convex = "0.16" From 68edf29cc2a66669038e0b15e7bf1db19ca3a9c6 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 21 Apr 2025 17:16:18 +0200 Subject: [PATCH 18/81] add initial CUDA pipeline --- .buildkite/pipeline.yml | 9 ++++++--- test/Project.toml | 1 + test/runtests.jl | 9 +++++++++ test/test_cuda.jl | 20 ++++++++++++++++++++ 4 files changed, 36 insertions(+), 3 deletions(-) create mode 100644 test/test_cuda.jl diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 0f8ad475db8..344b8eacc3a 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,3 +1,5 @@ +env: + steps: - label: "CUDA Julia {{matrix.version}}" matrix: @@ -7,12 +9,13 @@ steps: plugins: - JuliaCI/julia#v1: version: "{{matrix.version}}" - command: | - true + - JuliaCI/julia-test#v1: ~ + env: + TRIXI_TEST: "CUDA" agents: queue: "juliagpu" cuda: "*" if: build.message !~ /\[skip ci\]/ timeout_in_minutes: 60 soft_fail: - - exit_status: 3 \ No newline at end of file + - exit_status: 3 diff --git a/test/Project.toml b/test/Project.toml index 94683d362f5..78b35c6b2de 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -4,6 +4,7 @@ Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" Convex = "f65535da-76fb-5f13-bab9-19810c17039a" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" DoubleFloats = "497a8b3b-efae-58df-a0af-a86822472b78" Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" diff --git a/test/runtests.jl b/test/runtests.jl index db2c2e9dd88..8f35e1fb58d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -109,4 +109,13 @@ const TRIXI_NTHREADS = clamp(Sys.CPU_THREADS, 2, 3) @time if TRIXI_TEST == "all" || TRIXI_TEST == "paper_self_gravitating_gas_dynamics" include("test_paper_self_gravitating_gas_dynamics.jl") end + + @time if TRIXI_TEST 
== "all" || TRIXI_TEST == "CUDA"
+        import CUDA
+        if CUDA.functional()
+            include("test_cuda.jl")
+        else
+            @warn "Unable to run CUDA tests on this machine"
+        end
+    end
 end
diff --git a/test/test_cuda.jl b/test/test_cuda.jl
new file mode 100644
index 00000000000..f2fd11233c6
--- /dev/null
+++ b/test/test_cuda.jl
@@ -0,0 +1,20 @@
+module TestCUDA
+
+using CUDA
+using Test
+using Trixi
+
+include("test_trixi.jl")
+
+# EXAMPLES_DIR = joinpath(examples_dir(), "dgmulti_1d")
+
+# Start with a clean environment: remove Trixi.jl output directory if it exists
+outdir = "out"
+isdir(outdir) && rm(outdir, recursive = true)
+
+# TODO:
+
+# Clean up afterwards: delete Trixi.jl output directory
+@test_nowarn isdir(outdir) && rm(outdir, recursive = true)
+
+end # module
From 11ff63aade34a8d3be33bc0d46da9ef8f356db83 Mon Sep 17 00:00:00 2001
From: Valentin Churavy
Date: Tue, 22 Apr 2025 10:08:37 +0200
Subject: [PATCH 19/81] add storage_type, real_type to semidiscretize

---
 .../p4est_2d_dgsem/elixir_advection_basic.jl  |  2 +-
 src/semidiscretization/semidiscretization.jl  | 21 ++++++++++++++++++-
 test/test_p4est_2d.jl                         | 21 +++++++++++++++++++
 3 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/examples/p4est_2d_dgsem/elixir_advection_basic.jl b/examples/p4est_2d_dgsem/elixir_advection_basic.jl
index 4ff646365aa..e162e8997f2 100644
--- a/examples/p4est_2d_dgsem/elixir_advection_basic.jl
+++ b/examples/p4est_2d_dgsem/elixir_advection_basic.jl
@@ -31,7 +31,7 @@ semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition_convergen
 # ODE solvers, callbacks etc.
 
 # Create ODE problem with time span from 0.0 to 1.0
-ode = semidiscretize(semi, (0.0, 1.0))
+ode = semidiscretize(semi, (0.0, 1.0); real_type = nothing, storage_type = nothing)
 
 # At the beginning of the main loop, the SummaryCallback prints a summary of the simulation setup
 # and resets the timers
diff --git a/src/semidiscretization/semidiscretization.jl b/src/semidiscretization/semidiscretization.jl
index cc3900d42da..97c50aa46a1 100644
--- a/src/semidiscretization/semidiscretization.jl
+++ b/src/semidiscretization/semidiscretization.jl
@@ -82,9 +82,15 @@ end
 Wrap the semidiscretization `semi` as an ODE problem in the time interval `tspan`
 that can be passed to `solve` from the [SciML ecosystem](https://diffeq.sciml.ai/latest/).
+
+The optional keyword arguments `storage_type` and `real_type` configure the underlying computational
+data structures. `storage_type` changes the fundamental array type being used, allowing the
+experimental use of `CuArray` or other GPU array types. `real_type` changes the computational data type being used.
 """
 function semidiscretize(semi::AbstractSemidiscretization, tspan;
-                        reset_threads = true)
+                        reset_threads = true,
+                        storage_type = nothing,
+                        real_type = nothing)
     # Optionally reset Polyester.jl threads. See
     # https://github.com/trixi-framework/Trixi.jl/issues/1583
     # https://github.com/JuliaSIMD/Polyester.jl/issues/30
@@ -92,6 +98,19 @@ function semidiscretize(semi::AbstractSemidiscretization, tspan;
         Polyester.reset_threads!()
     end
 
+    if !(storage_type === nothing && real_type === nothing)
+        if storage_type === nothing
+            storage_type = Array
+        end
+        if real_type === nothing
+            real_type = Float64
+        end
+        semi = trixi_adapt(storage_type, real_type, semi)
+        if eltype(tspan) !== real_type
+            tspan = convert.(real_type, tspan)
+        end
+    end
+
     u0_ode = compute_coefficients(first(tspan), semi)
    # TODO: MPI, do we want to synchronize loading and print debug statements, e.g. 
using # mpi_isparallel() && MPI.Barrier(mpi_comm()) diff --git a/test/test_p4est_2d.jl b/test/test_p4est_2d.jl index 7425d243111..307d70683a5 100644 --- a/test/test_p4est_2d.jl +++ b/test/test_p4est_2d.jl @@ -35,6 +35,27 @@ isdir(outdir) && rm(outdir, recursive = true) @test real(semi32.mesh) == Float64 end +@trixi_testset "elixir_advection_basic.jl (Float32)" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic.jl"), + # Expected errors are exactly the same as with TreeMesh! + l2=[8.311947673061856e-6], + linf=[6.627000273229378e-5], + real_type=Float32) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + let + t = sol.t[end] + u_ode = sol.u[end] + du_ode = similar(u_ode) + @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000 + end + @test real(ode.p.solver) == Float32 + @test real(ode.p.solver.basis) == Float32 + @test real(ode.p.solver.mortar) == Float32 + # TODO: remake ignores the mesh itself as well + @test real(ode.p.mesh) == Float64 +end + @trixi_testset "elixir_advection_nonconforming_flag.jl" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_nonconforming_flag.jl"), From 4d8a31f0a1f4e08cd72262e90313d862e64f40b1 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 22 Apr 2025 10:25:33 +0200 Subject: [PATCH 20/81] add GPU construction test --- .../elixir_advection_basic_gpu.jl | 60 +++++++++++++++++++ test/test_cuda.jl | 24 +++++++- 2 files changed, 83 insertions(+), 1 deletion(-) create mode 100644 examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl diff --git a/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl new file mode 100644 index 00000000000..4e26ec3df1a --- /dev/null +++ b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl @@ -0,0 +1,60 @@ +# The same setup as tree_2d_dgsem/elixir_advection_basic.jl +# to verify the StructuredMesh implementation against TreeMesh + +using OrdinaryDiffEqSSPRK, OrdinaryDiffEqLowStorageRK +using Trixi + +############################################################################### +# semidiscretization of the linear advection equation + +advection_velocity = (0.2, -0.7) +equations = LinearScalarAdvectionEquation2D(advection_velocity) + +# Create DG solver with polynomial degree = 3 and (local) Lax-Friedrichs/Rusanov flux as surface flux +solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs) + +coordinates_min = (-1.0, -1.0) # minimum coordinates (min(x), min(y)) +coordinates_max = (1.0, 1.0) # maximum coordinates (max(x), max(y)) + +trees_per_dimension = (8, 8) + +# Create P4estMesh with 8 x 8 trees and 16 x 16 elements +mesh = P4estMesh(trees_per_dimension, polydeg = 3, + coordinates_min = coordinates_min, coordinates_max = coordinates_max, + initial_refinement_level = 1) + +# A semidiscretization collects data structures and functions for the spatial discretization +semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition_convergence_test, + solver) + +############################################################################### +# ODE solvers, callbacks etc. 
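+
+# Note on the keyword arguments used below: `semidiscretize` accepts `storage_type`
+# and `real_type` (see `src/semidiscretization/semidiscretization.jl`). Passing, e.g.,
+# `storage_type = CuArray` and `real_type = Float32` would adapt the semidiscretization
+# to GPU storage and single precision before the ODE problem is created; `nothing`
+# keeps the default `Array`/`Float64` setup.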
+ +# Create ODE problem with time span from 0.0 to 1.0 +ode = semidiscretize(semi, (0.0, 1.0); real_type = nothing, storage_type = nothing) + +# At the beginning of the main loop, the SummaryCallback prints a summary of the simulation setup +# and resets the timers +summary_callback = SummaryCallback() + +# The AnalysisCallback allows to analyse the solution in regular intervals and prints the results +analysis_callback = AnalysisCallback(semi, interval = 100) + +# The SaveSolutionCallback allows to save the solution to a file in regular intervals +save_solution = SaveSolutionCallback(interval = 100, + solution_variables = cons2prim) + +# The StepsizeCallback handles the re-calculation of the maximum Δt after each time step +stepsize_callback = StepsizeCallback(cfl = 1.6) + +# Create a CallbackSet to collect all callbacks such that they can be passed to the ODE solver +callbacks = CallbackSet(summary_callback, analysis_callback, save_solution, + stepsize_callback) + +############################################################################### +# run the simulation + +# # OrdinaryDiffEq's `solve` method evolves the solution in time and executes the passed callbacks +# sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false); +# dt = 1.0, # solve needs some value here but it will be overwritten by the stepsize_callback +# ode_default_options()..., callback = callbacks); diff --git a/test/test_cuda.jl b/test/test_cuda.jl index f2fd11233c6..68872266986 100644 --- a/test/test_cuda.jl +++ b/test/test_cuda.jl @@ -12,7 +12,29 @@ include("test_trixi.jl") outdir = "out" isdir(outdir) && rm(outdir, recursive = true) -# TODO: +EXAMPLES_DIR = joinpath(examples_dir(), "p4est_2d_dgsem") + +@trixi_testset "elixir_advection_basic.jl (Float32)" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic.jl"), + # Expected errors are exactly the same as with TreeMesh! 
+ l2=[8.311947673061856e-6], + linf=[6.627000273229378e-5], + real_type=Float32, + storage_type=CuArray) + # # Ensure that we do not have excessive memory allocations + # # (e.g., from type instabilities) + # let + # t = sol.t[end] + # u_ode = sol.u[end] + # du_ode = similar(u_ode) + # @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000 + # end + @test real(ode.p.solver) == Float32 + @test real(ode.p.solver.basis) == Float32 + @test real(ode.p.solver.mortar) == Float32 + # TODO: remake ignores the mesh itself as well + @test real(ode.p.mesh) == Float64 +end # Clean up afterwards: delete Trixi.jl output directory @test_nowarn isdir(outdir) && rm(outdir, recursive = true) From 6ca8c3d0359fa49efb55313ef0f63ad3cccd26a4 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 22 Apr 2025 12:08:26 +0200 Subject: [PATCH 21/81] don't adapt Array{MArray} --- src/auxiliary/containers.jl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/auxiliary/containers.jl b/src/auxiliary/containers.jl index 5738467ec6b..edc42db382b 100644 --- a/src/auxiliary/containers.jl +++ b/src/auxiliary/containers.jl @@ -388,6 +388,13 @@ function Adapt.adapt_storage(::TrixiAdaptor{Storage, Real}, adapt(Storage{StaticArrays.similar_type(T, Real)}, x) end +# Our threaded cache contains MArray, it is unlikely that we would want to adapt those +function Adapt.adapt_storage(::TrixiAdaptor{Storage, Real}, + x::Array{T}) where {Storage, Real, + T <: StaticArrays.MArray} + adapt(Array{StaticArrays.similar_type(T, Real)}, x) +end + function Adapt.adapt_storage(::TrixiAdaptor{Storage, Real}, x::AbstractArray) where {Storage, Real} adapt(Storage, x) From 4ef2d98bd6d9ad4a3a50bec15ab82c8d6138f640 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 22 Apr 2025 13:36:22 +0200 Subject: [PATCH 22/81] add some more cuda adapt tests --- test/test_cuda.jl | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/test/test_cuda.jl b/test/test_cuda.jl index 68872266986..7a218f236d3 100644 --- a/test/test_cuda.jl +++ b/test/test_cuda.jl @@ -19,7 +19,7 @@ EXAMPLES_DIR = joinpath(examples_dir(), "p4est_2d_dgsem") # Expected errors are exactly the same as with TreeMesh! 
l2=[8.311947673061856e-6], linf=[6.627000273229378e-5], - real_type=Float32, + real_type=Float64, storage_type=CuArray) # # Ensure that we do not have excessive memory allocations # # (e.g., from type instabilities) @@ -34,6 +34,17 @@ EXAMPLES_DIR = joinpath(examples_dir(), "p4est_2d_dgsem") @test real(ode.p.solver.mortar) == Float32 # TODO: remake ignores the mesh itself as well @test real(ode.p.mesh) == Float64 + + @test_broken ode.u0 isa CuArray + @test ode.p.basis.boundary_interpolations isa CuArray + @test ode.p.basis.derivative_matrix isa CuArray + + @test ode.p.basis.forward_upper isa CuArray + + @test Trixi.storage_type(ode.p.cache.elements) === CuArray + @test Trixi.storage_type(ode.p.cache.interfaces) === CuArray + @test Trixi.storage_type(ode.p.cache.boundaries) === CuArray + @test Trixi.storage_type(ode.p.cache.mortrar) === CuArray end # Clean up afterwards: delete Trixi.jl output directory From 77395f5ecf581493fd76b5112f8ca8283f5df487 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 28 Apr 2025 16:18:18 +0200 Subject: [PATCH 23/81] use sources for dev branch --- .buildkite/pipeline.yml | 2 +- test/Project.toml | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 344b8eacc3a..fdb4a855961 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -5,7 +5,7 @@ steps: matrix: setup: version: - - "1.10" + - "1.11" plugins: - JuliaCI/julia#v1: version: "{{matrix.version}}" diff --git a/test/Project.toml b/test/Project.toml index 78b35c6b2de..df66fe98966 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -59,3 +59,6 @@ Random = "1" StableRNGs = "1.0.2" Test = "1" TrixiTest = "0.1" + +[sources] +CUDA = {url = "https://github.com/JuliaGPU/CUDA.jl", rev = "vc/unsafe_wrap_symbols"} From 1d78f077d471f9f92fa135ce05f2edd39f0e1df9 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Thu, 8 May 2025 11:50:42 +0200 Subject: [PATCH 24/81] fixup! 
use sources for dev branch --- test/Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Project.toml b/test/Project.toml index df66fe98966..ff6de774355 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -61,4 +61,4 @@ Test = "1" TrixiTest = "0.1" [sources] -CUDA = {url = "https://github.com/JuliaGPU/CUDA.jl", rev = "vc/unsafe_wrap_symbols"} +CUDA = {url = "https://github.com/JuliaGPU/CUDA.jl", rev = "master"} From 39535eec0bdd68f1bb21bfcd565f022f44e96c3a Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Wed, 14 May 2025 10:38:54 +0200 Subject: [PATCH 25/81] use released version of CUDA --- .github/workflows/GPUCompat.yml | 86 --------------------------------- Project.toml | 2 +- test/Project.toml | 3 -- 3 files changed, 1 insertion(+), 90 deletions(-) delete mode 100644 .github/workflows/GPUCompat.yml diff --git a/.github/workflows/GPUCompat.yml b/.github/workflows/GPUCompat.yml deleted file mode 100644 index 335e1c83c4c..00000000000 --- a/.github/workflows/GPUCompat.yml +++ /dev/null @@ -1,86 +0,0 @@ -name: GPU Package Compatibility - -on: - pull_request: - paths-ignore: - - 'AUTHORS.md' - - 'CITATION.bib' - - 'CONTRIBUTING.md' - - 'LICENSE.md' - - 'NEWS.md' - - 'README.md' - - '.zenodo.json' - - '.github/workflows/benchmark.yml' - - '.github/workflows/CompatHelper.yml' - - '.github/workflows/TagBot.yml' - - 'benchmark/**' - - 'docs/**' - - 'utils/**' - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - test: - if: "!contains(github.event.head_commit.message, 'skip ci')" - name: ${{ matrix.os }} - ${{ matrix.arch }} - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - include: - - version: '1.10' - os: ubuntu-latest - arch: x64 - - version: '1.10' - os: windows-latest - arch: x64 - # CUDA.jl only supports 64-bit Linux and Windows, see https://github.com/JuliaGPU/CUDA.jl?tab=readme-ov-file#requirements - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Set up Julia - uses: julia-actions/setup-julia@v2 - with: - version: ${{ matrix.version }} - arch: ${{ matrix.arch }} - - - name: Display version info - run: julia -e 'using InteractiveUtils; versioninfo(verbose=true)' - - - name: Cache Julia packages - uses: julia-actions/cache@v2 - - - name: Build project - uses: julia-actions/julia-buildpkg@v1 - - # Only CUDA.jl is needed for GPU compatibility test now - - name: Add CUDA.jl to environment - run: | - julia --project=. -e ' - using Pkg; - Pkg.activate(temp=true); - Pkg.develop(PackageSpec(path=pwd())); - Pkg.add("CUDA"); - Pkg.update()' - - # - name: Add Metal.jl to environment - # run: | - # julia --project=. -e ' - # using Pkg; - # Pkg.activate(temp=true); - # Pkg.develop(PackageSpec(path=pwd())); - # Pkg.add("Metal"); - # Pkg.update()' - - # - name: Add AMDGPU.jl to environment - # run: | - # julia --project=. 
-e ' - # using Pkg; - # Pkg.activate(temp=true); - # Pkg.develop(PackageSpec(path=pwd())); - # Pkg.add("AMDGPU"); - # Pkg.update()' diff --git a/Project.toml b/Project.toml index 689f054adf0..ea207a63cbe 100644 --- a/Project.toml +++ b/Project.toml @@ -68,7 +68,7 @@ TrixiCUDAExt = "CUDA" [compat] Accessors = "0.1.36" Adapt = "4" -CUDA = "5.2" +CUDA = "5.8" CodeTracking = "1.0.5" ConstructionBase = "1.5" Convex = "0.16" diff --git a/test/Project.toml b/test/Project.toml index ff6de774355..78b35c6b2de 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -59,6 +59,3 @@ Random = "1" StableRNGs = "1.0.2" Test = "1" TrixiTest = "0.1" - -[sources] -CUDA = {url = "https://github.com/JuliaGPU/CUDA.jl", rev = "master"} From b973758daa699c84be8e1e444f0b5cab0e74e1ab Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Wed, 14 May 2025 10:43:30 +0200 Subject: [PATCH 26/81] Update .buildkite/pipeline.yml --- .buildkite/pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index fdb4a855961..344b8eacc3a 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -5,7 +5,7 @@ steps: matrix: setup: version: - - "1.11" + - "1.10" plugins: - JuliaCI/julia#v1: version: "{{matrix.version}}" From 7105da72985c927b12200d775413e400101854e6 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 30 Jun 2025 14:01:15 +0200 Subject: [PATCH 27/81] fix test_p4est_2d --- test/test_p4est_2d.jl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/test_p4est_2d.jl b/test/test_p4est_2d.jl index 307d70683a5..33d24c8d67e 100644 --- a/test/test_p4est_2d.jl +++ b/test/test_p4est_2d.jl @@ -38,8 +38,9 @@ end @trixi_testset "elixir_advection_basic.jl (Float32)" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic.jl"), # Expected errors are exactly the same as with TreeMesh! 
-                        l2=[8.311947673061856e-6],
-                        linf=[6.627000273229378e-5],
+                        l2=[Float32(8.311947673061856e-6)],
+                        linf=[Float32(6.627000273229378e-5)],
+                        RealT=Float32,
                         real_type=Float32)
     # Ensure that we do not have excessive memory allocations
     # (e.g., from type instabilities)
@@ -47,7 +48,7 @@ end
         t = sol.t[end]
         u_ode = sol.u[end]
         du_ode = similar(u_ode)
-        @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000
+        @test_broken (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000
     end
     @test real(ode.p.solver) == Float32
     @test real(ode.p.solver.basis) == Float32
From 1fd6fe6614ebe799da375e3cf15569634ca4fb13 Mon Sep 17 00:00:00 2001
From: Valentin Churavy
Date: Mon, 30 Jun 2025 21:12:08 +0200
Subject: [PATCH 28/81] fix first GPU test

---
 test/test_cuda.jl | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/test/test_cuda.jl b/test/test_cuda.jl
index 7a218f236d3..1f96d8c863e 100644
--- a/test/test_cuda.jl
+++ b/test/test_cuda.jl
@@ -1,25 +1,27 @@
 module TestCUDA
 
-using CUDA
 using Test
 using Trixi
 
 include("test_trixi.jl")
 
-# EXAMPLES_DIR = joinpath(examples_dir(), "dgmulti_1d")
-
 # Start with a clean environment: remove Trixi.jl output directory if it exists
 outdir = "out"
 isdir(outdir) && rm(outdir, recursive = true)
 
 EXAMPLES_DIR = joinpath(examples_dir(), "p4est_2d_dgsem")
 
-@trixi_testset "elixir_advection_basic.jl (Float32)" begin
-    @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic.jl"),
+@trixi_testset "elixir_advection_basic_gpu.jl" begin
+    # Using CUDA inside the testset since otherwise the bindings are hidden by the anonymous modules
+    using CUDA
+    # TODO(benegee/vchuravy): Port compute_coefficients! to KernelAbstractions.jl
+    CUDA.allowscalar(true)
+    @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"),
                         # Expected errors are exactly the same as with TreeMesh! 
- l2=[8.311947673061856e-6], - linf=[6.627000273229378e-5], - real_type=Float64, + l2=nothing, # [Float32(8.311947673061856e-6)], + linf=nothing, # [Float32(6.627000273229378e-5)], + RealT=Float32, + real_type=Float32, storage_type=CuArray) # # Ensure that we do not have excessive memory allocations # # (e.g., from type instabilities) @@ -36,15 +38,12 @@ EXAMPLES_DIR = joinpath(examples_dir(), "p4est_2d_dgsem") @test real(ode.p.mesh) == Float64 @test_broken ode.u0 isa CuArray - @test ode.p.basis.boundary_interpolations isa CuArray - @test ode.p.basis.derivative_matrix isa CuArray - - @test ode.p.basis.forward_upper isa CuArray + @test ode.p.solver.basis.derivative_matrix isa CuArray @test Trixi.storage_type(ode.p.cache.elements) === CuArray @test Trixi.storage_type(ode.p.cache.interfaces) === CuArray @test Trixi.storage_type(ode.p.cache.boundaries) === CuArray - @test Trixi.storage_type(ode.p.cache.mortrar) === CuArray + @test Trixi.storage_type(ode.p.cache.mortars) === CuArray end # Clean up afterwards: delete Trixi.jl output directory From 6ceef3af12898f74a12bbfe2359ca2a805fc51dd Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 1 Jul 2025 09:18:34 +0200 Subject: [PATCH 29/81] address review comments --- src/solvers/dgsem_p4est/containers.jl | 40 +++++++++++-------- .../dgsem_p4est/containers_parallel.jl | 7 ++-- 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/src/solvers/dgsem_p4est/containers.jl b/src/solvers/dgsem_p4est/containers.jl index 68e5b3d758b..3da09b5db55 100644 --- a/src/solvers/dgsem_p4est/containers.jl +++ b/src/solvers/dgsem_p4est/containers.jl @@ -60,30 +60,38 @@ function Base.resize!(elements::P4estElementContainer, capacity) ArrayType = storage_type(elements) resize!(_node_coordinates, n_dims * n_nodes^n_dims * capacity) - elements.node_coordinates = unsafe_wrap(ArrayType, pointer(_node_coordinates), - (n_dims, ntuple(_ -> n_nodes, n_dims)..., - capacity)) + elements.node_coordinates = unsafe_wrap_or_alloc(ArrayType, + pointer(_node_coordinates), + (n_dims, + ntuple(_ -> n_nodes, n_dims)..., + capacity)) resize!(_jacobian_matrix, n_dims^2 * n_nodes^n_dims * capacity) - elements.jacobian_matrix = unsafe_wrap(ArrayType, pointer(_jacobian_matrix), - (n_dims, n_dims, - ntuple(_ -> n_nodes, n_dims)..., capacity)) + elements.jacobian_matrix = unsafe_wrap_or_alloc(ArrayType, + pointer(_jacobian_matrix), + (n_dims, n_dims, + ntuple(_ -> n_nodes, n_dims)..., + capacity)) resize!(_contravariant_vectors, length(_jacobian_matrix)) - elements.contravariant_vectors = unsafe_wrap(ArrayType, - pointer(_contravariant_vectors), - size(elements.jacobian_matrix)) + elements.contravariant_vectors = unsafe_wrap_or_alloc(ArrayType, + pointer(_contravariant_vectors), + size(elements.jacobian_matrix)) resize!(_inverse_jacobian, n_nodes^n_dims * capacity) - elements.inverse_jacobian = unsafe_wrap(ArrayType, pointer(_inverse_jacobian), - (ntuple(_ -> n_nodes, n_dims)..., capacity)) + elements.inverse_jacobian = unsafe_wrap_or_alloc(ArrayType, + pointer(_inverse_jacobian), + (ntuple(_ -> n_nodes, n_dims)..., + capacity)) resize!(_surface_flux_values, n_variables * n_nodes^(n_dims - 1) * (n_dims * 2) * capacity) - elements.surface_flux_values = unsafe_wrap(ArrayType, pointer(_surface_flux_values), - (n_variables, - ntuple(_ -> n_nodes, n_dims - 1)..., - n_dims * 2, capacity)) + elements.surface_flux_values = unsafe_wrap_or_alloc(ArrayType, + pointer(_surface_flux_values), + (n_variables, + ntuple(_ -> n_nodes, + n_dims - 1)..., + n_dims * 2, capacity)) return nothing end 
@@ -221,7 +229,7 @@ end @inline Base.ndims(::P4estInterfaceContainer{NDIMS}) where {NDIMS} = NDIMS @inline function Base.eltype(::P4estInterfaceContainer{NDIMS, uEltype}) where {NDIMS, uEltype} - uEltype + return uEltype end # See explanation of Base.resize! for the element container diff --git a/src/solvers/dgsem_p4est/containers_parallel.jl b/src/solvers/dgsem_p4est/containers_parallel.jl index cb9cd1ffc95..123337d8c0a 100644 --- a/src/solvers/dgsem_p4est/containers_parallel.jl +++ b/src/solvers/dgsem_p4est/containers_parallel.jl @@ -222,9 +222,10 @@ function init_mpi_mortars!(mpi_mortars, mesh::ParallelP4estMesh, basis, elements end function Adapt.adapt_structure(to, mpi_mortars::P4estMPIMortarContainer) - # TODO: Vector of Vector type data structure does not work on GPUs, - # must be redesigned. This skeleton implementation here just exists just - # for compatibility with the rest of the KA.jl solver code + # Only parts of this container are adapted, since we currently don't + # use `local_neighbor_ids`, `local_neighbor_positions`, `normal_directions` + # on the GPU. If we do need them we need to redesign this to use the VecOfArrays + # approach. _u = adapt(to, mpi_mortars._u) _node_indices = mpi_mortars._node_indices From 7a53362dfac0d03e6dbad2fb47bd4a6839e90d3e Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Tue, 1 Jul 2025 15:08:30 +0200 Subject: [PATCH 30/81] offload compute_coefficients --- Project.toml | 2 + .../elixir_advection_basic_gpu.jl | 18 ++++--- src/Trixi.jl | 1 + src/auxiliary/containers.jl | 4 ++ src/semidiscretization/semidiscretization.jl | 3 +- src/solvers/dg.jl | 47 +++++++++++++++---- 6 files changed, 54 insertions(+), 21 deletions(-) diff --git a/Project.toml b/Project.toml index ea207a63cbe..7bea3abf0f9 100644 --- a/Project.toml +++ b/Project.toml @@ -17,6 +17,7 @@ EllipsisNotation = "da5c29d0-fa7d-589e-88eb-ea29b0a81949" FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LinearMaps = "7a12625a-238d-50fd-b39a-03d52299707e" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" @@ -82,6 +83,7 @@ EllipsisNotation = "1.0" FillArrays = "1.9" ForwardDiff = "0.10.36, 1" HDF5 = "0.16.10, 0.17" +KernelAbstractions = "0.9" LinearAlgebra = "1" LinearMaps = "2.7, 3.0" LoopVectorization = "0.12.171" diff --git a/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl index 4e26ec3df1a..5f34784ddf9 100644 --- a/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl +++ b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl @@ -1,8 +1,6 @@ -# The same setup as tree_2d_dgsem/elixir_advection_basic.jl -# to verify the StructuredMesh implementation against TreeMesh - -using OrdinaryDiffEqSSPRK, OrdinaryDiffEqLowStorageRK +using OrdinaryDiffEqLowStorageRK using Trixi +using CUDA ############################################################################### # semidiscretization of the linear advection equation @@ -31,7 +29,7 @@ semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition_convergen # ODE solvers, callbacks etc. 
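+# With `storage_type = CuArray` below, the semidiscretization's storage arrays are
+# adapted to GPU arrays when the ODE problem is created.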
# Create ODE problem with time span from 0.0 to 1.0 -ode = semidiscretize(semi, (0.0, 1.0); real_type = nothing, storage_type = nothing) +ode = semidiscretize(semi, (0.0, 1.0); real_type = nothing, storage_type = CuArray) # At the beginning of the main loop, the SummaryCallback prints a summary of the simulation setup # and resets the timers @@ -48,13 +46,13 @@ save_solution = SaveSolutionCallback(interval = 100, stepsize_callback = StepsizeCallback(cfl = 1.6) # Create a CallbackSet to collect all callbacks such that they can be passed to the ODE solver -callbacks = CallbackSet(summary_callback, analysis_callback, save_solution, - stepsize_callback) +callbacks = CallbackSet(summary_callback) +# analysis_callback, save_solution, stepsize_callback) ############################################################################### # run the simulation # # OrdinaryDiffEq's `solve` method evolves the solution in time and executes the passed callbacks -# sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false); -# dt = 1.0, # solve needs some value here but it will be overwritten by the stepsize_callback -# ode_default_options()..., callback = callbacks); + sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false); + dt = 1e-2, # solve needs some value here but it will be overwritten by the stepsize_callback + ode_default_options()..., callback = callbacks); diff --git a/src/Trixi.jl b/src/Trixi.jl index a52dfd6d973..7836f1938b1 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -59,6 +59,7 @@ using DiffEqCallbacks: PeriodicCallback, PeriodicCallbackAffect using FillArrays: Ones, Zeros using ForwardDiff: ForwardDiff using HDF5: HDF5, h5open, attributes, create_dataset, datatype, dataspace +using KernelAbstractions using LinearMaps: LinearMap if _PREFERENCE_LOOPVECTORIZATION using LoopVectorization: LoopVectorization, @turbo, indices diff --git a/src/auxiliary/containers.jl b/src/auxiliary/containers.jl index edc42db382b..40aff873956 100644 --- a/src/auxiliary/containers.jl +++ b/src/auxiliary/containers.jl @@ -405,4 +405,8 @@ end function unsafe_wrap_or_alloc(::TrixiAdaptor{Storage}, vec, size) where {Storage} return unsafe_wrap_or_alloc(Storage, vec, size) end + +function KernelAbstractions.get_backend(semi::AbstractSemidiscretization) + KernelAbstractions.get_backend(semi.cache.elements.node_coordinates) +end end # @muladd diff --git a/src/semidiscretization/semidiscretization.jl b/src/semidiscretization/semidiscretization.jl index 97c50aa46a1..e214f569d13 100644 --- a/src/semidiscretization/semidiscretization.jl +++ b/src/semidiscretization/semidiscretization.jl @@ -176,7 +176,8 @@ Same as [`compute_coefficients`](@ref) but stores the result in `u_ode`. function compute_coefficients!(u_ode, func, t, semi::AbstractSemidiscretization) u = wrap_array(u_ode, semi) # Call `compute_coefficients` defined by the solver - compute_coefficients!(u, func, t, mesh_equations_solver_cache(semi)...) + backend = get_backend(semi) + compute_coefficients!(backend, u, func, t, mesh_equations_solver_cache(semi)...) end """ diff --git a/src/solvers/dg.jl b/src/solvers/dg.jl index 78f3901a346..273cc8f7a47 100644 --- a/src/solvers/dg.jl +++ b/src/solvers/dg.jl @@ -642,8 +642,10 @@ include("fdsbp_unstructured/fdsbp.jl") function allocate_coefficients(mesh::AbstractMesh, equations, dg::DG, cache) # We must allocate a `Vector` in order to be able to `resize!` it (AMR). # cf. 
wrap_array - zeros(eltype(cache.elements), - nvariables(equations) * nnodes(dg)^ndims(mesh) * nelements(dg, cache)) + u_ode = similar(cache.elements.node_coordinates, + nvariables(equations) * nnodes(dg)^ndims(mesh) * nelements(dg, cache)) + fill!(u_ode, zero(eltype(u_ode))) + return u_ode end @inline function wrap_array(u_ode::AbstractVector, mesh::AbstractMesh, equations, @@ -686,7 +688,8 @@ end # (nvariables(equations), ntuple(_ -> nnodes(dg), ndims(mesh))..., nelements(dg, cache))) else # The following version is reasonably fast and allows us to `resize!(u_ode, ...)`. - unsafe_wrap(Array{eltype(u_ode), ndims(mesh) + 2}, pointer(u_ode), + ArrayType = Trixi.storage_type(u_ode) + unsafe_wrap(ArrayType{eltype(u_ode), ndims(mesh) + 2}, pointer(u_ode), (nvariables(equations), ntuple(_ -> nnodes(dg), ndims(mesh))..., nelements(dg, cache))) end @@ -756,15 +759,39 @@ function compute_coefficients!(u, func, t, mesh::AbstractMesh{1}, equations, dg: end end -function compute_coefficients!(u, func, t, mesh::AbstractMesh{2}, equations, dg::DG, +function compute_coefficients!(backend::Any, func, t, mesh::AbstractMesh{2}, equations, dg::DG, cache) + @unpack node_coordinates = cache.elements @threaded for element in eachelement(dg, cache) - for j in eachnode(dg), i in eachnode(dg) - x_node = get_node_coords(cache.elements.node_coordinates, equations, dg, i, - j, element) - u_node = func(x_node, t, equations) - set_node_vars!(u, u_node, equations, dg, i, j, element) - end + compute_coefficients_element!(u, func, t, equations, dg, node_coordinates, element) + end +end + +function compute_coefficients!(backend::Backend, u, func, t, mesh::AbstractMesh{2}, + equations, dg::DG, cache) + nelements(dg, cache) == 0 && return nothing + # 1 cache not as argument + # 2 mesh not + @unpack node_coordinates = cache.elements + kernel! 
= compute_coefficients_kernel!(backend) + kernel!(u, func, t, equations, dg, node_coordinates, + ndrange = nelements(dg, cache)) + return nothing +end + +@kernel function compute_coefficients_kernel!(u, func, t, equations, + dg::DG, node_coordinates) + element = @index(Global) + compute_coefficients_element!(u, func, t, equations, dg, node_coordinates, element) +end + +function compute_coefficients_element!(u, func, t, equations, dg::DG, + node_coordinates, element) + for j in eachnode(dg), i in eachnode(dg) + x_node = get_node_coords(node_coordinates, equations, dg, i, + j, element) + u_node = func(x_node, t, equations) + set_node_vars!(u, u_node, equations, dg, i, j, element) end end From 68eb9052d11244268e1b1929295dec6bcfe8c070 Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Tue, 1 Jul 2025 15:16:07 +0200 Subject: [PATCH 31/81] fmt --- examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl | 6 +++--- src/solvers/dg.jl | 11 +++++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl index 5f34784ddf9..b5291ea2862 100644 --- a/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl +++ b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl @@ -53,6 +53,6 @@ callbacks = CallbackSet(summary_callback) # run the simulation # # OrdinaryDiffEq's `solve` method evolves the solution in time and executes the passed callbacks - sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false); - dt = 1e-2, # solve needs some value here but it will be overwritten by the stepsize_callback - ode_default_options()..., callback = callbacks); +sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false); + dt = 1e-2, # solve needs some value here but it will be overwritten by the stepsize_callback + ode_default_options()..., callback = callbacks); diff --git a/src/solvers/dg.jl b/src/solvers/dg.jl index 273cc8f7a47..756036a0e55 100644 --- a/src/solvers/dg.jl +++ b/src/solvers/dg.jl @@ -643,7 +643,8 @@ function allocate_coefficients(mesh::AbstractMesh, equations, dg::DG, cache) # We must allocate a `Vector` in order to be able to `resize!` it (AMR). # cf. 
wrap_array u_ode = similar(cache.elements.node_coordinates, - nvariables(equations) * nnodes(dg)^ndims(mesh) * nelements(dg, cache)) + nvariables(equations) * nnodes(dg)^ndims(mesh) * + nelements(dg, cache)) fill!(u_ode, zero(eltype(u_ode))) return u_ode end @@ -759,11 +760,13 @@ function compute_coefficients!(u, func, t, mesh::AbstractMesh{1}, equations, dg: end end -function compute_coefficients!(backend::Any, func, t, mesh::AbstractMesh{2}, equations, dg::DG, +function compute_coefficients!(backend::Any, func, t, mesh::AbstractMesh{2}, equations, + dg::DG, cache) @unpack node_coordinates = cache.elements @threaded for element in eachelement(dg, cache) - compute_coefficients_element!(u, func, t, equations, dg, node_coordinates, element) + compute_coefficients_element!(u, func, t, equations, dg, node_coordinates, + element) end end @@ -789,7 +792,7 @@ function compute_coefficients_element!(u, func, t, equations, dg::DG, node_coordinates, element) for j in eachnode(dg), i in eachnode(dg) x_node = get_node_coords(node_coordinates, equations, dg, i, - j, element) + j, element) u_node = func(x_node, t, equations) set_node_vars!(u, u_node, equations, dg, i, j, element) end From 3d00bdfec5d4da71f13196c82c93f0fb92da24da Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 1 Jul 2025 18:50:30 +0200 Subject: [PATCH 32/81] fixup! address review comments --- src/solvers/dgsem_p4est/containers.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/solvers/dgsem_p4est/containers.jl b/src/solvers/dgsem_p4est/containers.jl index 3da09b5db55..83097f4a1ed 100644 --- a/src/solvers/dgsem_p4est/containers.jl +++ b/src/solvers/dgsem_p4est/containers.jl @@ -61,33 +61,33 @@ function Base.resize!(elements::P4estElementContainer, capacity) resize!(_node_coordinates, n_dims * n_nodes^n_dims * capacity) elements.node_coordinates = unsafe_wrap_or_alloc(ArrayType, - pointer(_node_coordinates), + _node_coordinates, (n_dims, ntuple(_ -> n_nodes, n_dims)..., capacity)) resize!(_jacobian_matrix, n_dims^2 * n_nodes^n_dims * capacity) elements.jacobian_matrix = unsafe_wrap_or_alloc(ArrayType, - pointer(_jacobian_matrix), + _jacobian_matrix, (n_dims, n_dims, ntuple(_ -> n_nodes, n_dims)..., capacity)) resize!(_contravariant_vectors, length(_jacobian_matrix)) elements.contravariant_vectors = unsafe_wrap_or_alloc(ArrayType, - pointer(_contravariant_vectors), + _contravariant_vectors, size(elements.jacobian_matrix)) resize!(_inverse_jacobian, n_nodes^n_dims * capacity) elements.inverse_jacobian = unsafe_wrap_or_alloc(ArrayType, - pointer(_inverse_jacobian), + _inverse_jacobian, (ntuple(_ -> n_nodes, n_dims)..., capacity)) resize!(_surface_flux_values, n_variables * n_nodes^(n_dims - 1) * (n_dims * 2) * capacity) elements.surface_flux_values = unsafe_wrap_or_alloc(ArrayType, - pointer(_surface_flux_values), + _surface_flux_values, (n_variables, ntuple(_ -> n_nodes, n_dims - 1)..., From 4b32fa0a384de43a1d6c8a3d89b5993391ec54fc Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 1 Jul 2025 18:59:47 +0200 Subject: [PATCH 33/81] add review comments --- docs/src/heterogeneous.md | 25 ++++++++++++++----- .../elixir_advection_basic_gpu.jl | 3 +++ 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/docs/src/heterogeneous.md b/docs/src/heterogeneous.md index 60bda029a40..b4027abdd3a 100644 --- a/docs/src/heterogeneous.md +++ b/docs/src/heterogeneous.md @@ -4,15 +4,16 @@ Support for heterogeneous computing is currently being worked on. 
## The use of Adapt.jl -[`Adapt.jl`](https://github.com/JuliaGPU/Adapt.jl) is a package in the JuliaGPU family that allows for +[Adapt.jl](https://github.com/JuliaGPU/Adapt.jl) is a package in the +[JuliaGPU](https://github.com/JuliaGPU) family that allows for the translation of nested data structures. The primary goal is to allow the substitution of `Array` -at the storage leaves with a GPU array like `CuArray`. +at the storage leaves with a GPU array like `CuArray` from [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl). To facilitate this data structures must be parameterized, so instead of: ```julia -struct Container - data::Array{Float64,2} +struct Container <: Trixi.AbstractContainer + data::Array{Float64, 2} end ``` @@ -47,7 +48,19 @@ function Adapt.parent_type(::Type{<:Container{D}}) where D end ``` -```julia-repl +All together we can use this machinery to perform conversions of a container. + +```jldoctest +julia> import Trixi, Adapt + +julia> struct Container{D<:AbstractArray} <: Trixi.AbstractContainer + data::D + end + +julia> Adapt.@adapt_structure(Container) + +julia> Adapt.parent_type(::Type{<:Container{D}}) where D = D + julia> C = Container(zeros(3)) Container{Vector{Float64}}([0.0, 0.0, 0.0]) @@ -65,7 +78,7 @@ CuArray ## Element-type conversion with `Trixi.trixi_adapt`. -We can use Trixi.trixi_adapt to perform both an element-type and a storage-type adoption +We can use [`Trixi.trixi_adapt`](@ref) to perform both an element-type and a storage-type adoption ```julia-repl julia> C = Container(zeros(3)) diff --git a/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl index 4e26ec3df1a..4c0f5744a88 100644 --- a/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl +++ b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl @@ -54,6 +54,9 @@ callbacks = CallbackSet(summary_callback, analysis_callback, save_solution, ############################################################################### # run the simulation +# TODO: Currently we can only construct the ODE problem on the GPU, but we cannot solve it on the GPU yet. +# Uncomment the calls below to discover missing functionality. 
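+# (As of this commit, only `compute_coefficients!` has been ported to
+# KernelAbstractions.jl; the DG `rhs!` kernels, for example, still assume
+# host arrays, which is what blocks solving on the GPU.)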
+ # # OrdinaryDiffEq's `solve` method evolves the solution in time and executes the passed callbacks # sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false); # dt = 1.0, # solve needs some value here but it will be overwritten by the stepsize_callback From 10f7593b3c08cbbfd69eefe893c07b7e1b8d5de7 Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Tue, 1 Jul 2025 19:25:28 +0200 Subject: [PATCH 34/81] convert fstar_* cache entries to VecOfArrays --- src/solvers/dgsem_p4est/dg_3d.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/solvers/dgsem_p4est/dg_3d.jl b/src/solvers/dgsem_p4est/dg_3d.jl index e59f502c86c..4c099c9fd3f 100644 --- a/src/solvers/dgsem_p4est/dg_3d.jl +++ b/src/solvers/dgsem_p4est/dg_3d.jl @@ -13,18 +13,18 @@ function create_cache(mesh::Union{P4estMesh{3}, T8codeMesh{3}}, equations, fstar_primary_threaded = [Array{uEltype, 4}(undef, nvariables(equations), nnodes(mortar_l2), nnodes(mortar_l2), 4) - for _ in 1:Threads.nthreads()] + for _ in 1:Threads.nthreads()] |> VecOfArrays fstar_secondary_threaded = [Array{uEltype, 4}(undef, nvariables(equations), nnodes(mortar_l2), nnodes(mortar_l2), 4) - for _ in 1:Threads.nthreads()] + for _ in 1:Threads.nthreads()] |> VecOfArrays fstar_tmp_threaded = [Array{uEltype, 3}(undef, nvariables(equations), nnodes(mortar_l2), nnodes(mortar_l2)) - for _ in 1:Threads.nthreads()] + for _ in 1:Threads.nthreads()] |> VecOfArrays u_threaded = [Array{uEltype, 3}(undef, nvariables(equations), nnodes(mortar_l2), nnodes(mortar_l2)) - for _ in 1:Threads.nthreads()] + for _ in 1:Threads.nthreads()] |> VecOfArrays (; fstar_primary_threaded, fstar_secondary_threaded, fstar_tmp_threaded, u_threaded) end From c83bdbd59e401ebd2ebaf3eb5add9281cb2b62e5 Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Tue, 1 Jul 2025 19:33:07 +0200 Subject: [PATCH 35/81] restore elixir --- examples/p4est_2d_dgsem/elixir_advection_basic.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/p4est_2d_dgsem/elixir_advection_basic.jl b/examples/p4est_2d_dgsem/elixir_advection_basic.jl index e162e8997f2..4ff646365aa 100644 --- a/examples/p4est_2d_dgsem/elixir_advection_basic.jl +++ b/examples/p4est_2d_dgsem/elixir_advection_basic.jl @@ -31,7 +31,7 @@ semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition_convergen # ODE solvers, callbacks etc. 
# Create ODE problem with time span from 0.0 to 1.0 -ode = semidiscretize(semi, (0.0, 1.0); real_type = nothing, storage_type = nothing) +ode = semidiscretize(semi, (0.0, 1.0)) # At the beginning of the main loop, the SummaryCallback prints a summary of the simulation setup # and resets the timers From d3b94fcaee421bc22f233a0e68e373093585ce1c Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Wed, 2 Jul 2025 09:11:54 +0200 Subject: [PATCH 36/81] test native version as well --- .../elixir_advection_basic_gpu.jl | 9 +++-- src/Trixi.jl | 1 + src/auxiliary/containers.jl | 8 +++++ src/semidiscretization/semidiscretization.jl | 2 +- src/solvers/dg.jl | 7 ++-- test/test_cuda.jl | 35 ++++++++++++++++--- 6 files changed, 46 insertions(+), 16 deletions(-) diff --git a/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl index 8fd7c31a413..61277a2734f 100644 --- a/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl +++ b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl @@ -1,6 +1,5 @@ using OrdinaryDiffEqLowStorageRK using Trixi -using CUDA ############################################################################### # semidiscretization of the linear advection equation @@ -29,7 +28,7 @@ semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition_convergen # ODE solvers, callbacks etc. # Create ODE problem with time span from 0.0 to 1.0 -ode = semidiscretize(semi, (0.0, 1.0); real_type = nothing, storage_type = CuArray) +ode = semidiscretize(semi, (0.0, 1.0); real_type = nothing, storage_type = nothing) # At the beginning of the main loop, the SummaryCallback prints a summary of the simulation setup # and resets the timers @@ -56,6 +55,6 @@ callbacks = CallbackSet(summary_callback) # Uncomment the calls below to discover missing functionality. 
# # OrdinaryDiffEq's `solve` method evolves the solution in time and executes the passed callbacks -sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false); - dt = 1e-2, # solve needs some value here but it will be overwritten by the stepsize_callback - ode_default_options()..., callback = callbacks); +#sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false); +# dt = 1e-2, # solve needs some value here but it will be overwritten by the stepsize_callback +# ode_default_options()..., callback = callbacks); diff --git a/src/Trixi.jl b/src/Trixi.jl index 7836f1938b1..18000e050bd 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -20,6 +20,7 @@ const _PREFERENCE_SQRT = @load_preference("sqrt", "sqrt_Trixi_NaN") const _PREFERENCE_LOG = @load_preference("log", "log_Trixi_NaN") const _PREFERENCE_POLYESTER = @load_preference("polyester", true) const _PREFERENCE_LOOPVECTORIZATION = @load_preference("loop_vectorization", true) +const _PREFERENCE_USE_NATIVE_THREADING = @load_preference("native_threading", true) # Include other packages that are used in Trixi.jl # (standard library packages first, other packages next, all of them sorted alphabetically) diff --git a/src/auxiliary/containers.jl b/src/auxiliary/containers.jl index 40aff873956..ac412eb2da8 100644 --- a/src/auxiliary/containers.jl +++ b/src/auxiliary/containers.jl @@ -406,6 +406,14 @@ function unsafe_wrap_or_alloc(::TrixiAdaptor{Storage}, vec, size) where {Storage return unsafe_wrap_or_alloc(Storage, vec, size) end +function trixi_backend(x) + backend = get_backend(x) + if _PREFERENCE_USE_NATIVE_THREADING && backend isa KernelAbstractions.CPU + backend = nothing + end + return backend +end + function KernelAbstractions.get_backend(semi::AbstractSemidiscretization) KernelAbstractions.get_backend(semi.cache.elements.node_coordinates) end diff --git a/src/semidiscretization/semidiscretization.jl b/src/semidiscretization/semidiscretization.jl index e214f569d13..b8f53237550 100644 --- a/src/semidiscretization/semidiscretization.jl +++ b/src/semidiscretization/semidiscretization.jl @@ -176,7 +176,7 @@ Same as [`compute_coefficients`](@ref) but stores the result in `u_ode`. function compute_coefficients!(u_ode, func, t, semi::AbstractSemidiscretization) u = wrap_array(u_ode, semi) # Call `compute_coefficients` defined by the solver - backend = get_backend(semi) + backend = trixi_backend(semi) compute_coefficients!(backend, u, func, t, mesh_equations_solver_cache(semi)...) end diff --git a/src/solvers/dg.jl b/src/solvers/dg.jl index 756036a0e55..9ec37647c97 100644 --- a/src/solvers/dg.jl +++ b/src/solvers/dg.jl @@ -760,9 +760,8 @@ function compute_coefficients!(u, func, t, mesh::AbstractMesh{1}, equations, dg: end end -function compute_coefficients!(backend::Any, func, t, mesh::AbstractMesh{2}, equations, - dg::DG, - cache) +function compute_coefficients!(backend::Nothing, u, func, t, mesh::AbstractMesh{2}, + equations, dg::DG, cache) @unpack node_coordinates = cache.elements @threaded for element in eachelement(dg, cache) compute_coefficients_element!(u, func, t, equations, dg, node_coordinates, @@ -773,8 +772,6 @@ end function compute_coefficients!(backend::Backend, u, func, t, mesh::AbstractMesh{2}, equations, dg::DG, cache) nelements(dg, cache) == 0 && return nothing - # 1 cache not as argument - # 2 mesh not @unpack node_coordinates = cache.elements kernel! 
= compute_coefficients_kernel!(backend)
     kernel!(u, func, t, equations, dg, node_coordinates,
             ndrange = nelements(dg, cache))
     return nothing
 end
diff --git a/test/test_cuda.jl b/test/test_cuda.jl
index 1f96d8c863e..c6904b41a9d 100644
--- a/test/test_cuda.jl
+++ b/test/test_cuda.jl
@@ -11,16 +11,41 @@ isdir(outdir) && rm(outdir, recursive = true)
 
 EXAMPLES_DIR = joinpath(examples_dir(), "p4est_2d_dgsem")
 
-@trixi_testset "elixir_advection_basic_gpu.jl" begin
+@trixi_testset "elixir_advection_basic_gpu.jl native" begin
+    @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"),
+                        # Expected errors are exactly the same as with TreeMesh!
+                        l2=nothing, # [Float32(8.311947673061856e-6)],
+                        linf=nothing,)
+    # # Ensure that we do not have excessive memory allocations
+    # # (e.g., from type instabilities)
+    # let
+    #     t = sol.t[end]
+    #     u_ode = sol.u[end]
+    #     du_ode = similar(u_ode)
+    #     @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000
+    # end
+    @test real(ode.p.solver) == Float64
+    @test real(ode.p.solver.basis) == Float64
+    @test real(ode.p.solver.mortar) == Float64
+    # TODO: remake ignores the mesh itself as well
+    @test real(ode.p.mesh) == Float64
+
+    @test ode.u0 isa Array
+    @test ode.p.solver.basis.derivative_matrix isa Array
+
+    @test Trixi.storage_type(ode.p.cache.elements) === Array
+    @test Trixi.storage_type(ode.p.cache.interfaces) === Array
+    @test Trixi.storage_type(ode.p.cache.boundaries) === Array
+    @test Trixi.storage_type(ode.p.cache.mortars) === Array
+end
+
+@trixi_testset "elixir_advection_basic_gpu.jl Float32 / CUDA" begin
     # Using CUDA inside the testset since otherwise the bindings are hidden by the anonymous modules
     using CUDA
     @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"),
                         # Expected errors are exactly the same as with TreeMesh! 
                        l2=nothing, # [Float32(8.311947673061856e-6)],
                        linf=nothing, # [Float32(6.627000273229378e-5)],
                        real_type=Float32,
                        storage_type=CuArray)
    # # Ensure that we do not have excessive memory allocations
    # # (e.g., from type instabilities)
    # let
    #     t = sol.t[end]
    #     u_ode = sol.u[end]
    #     du_ode = similar(u_ode)
    #     @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000
    # end
    @test real(ode.p.solver) == Float32
    @test real(ode.p.solver.basis) == Float32
    @test real(ode.p.solver.mortar) == Float32
    # TODO: remake ignores the mesh itself as well
    @test real(ode.p.mesh) == Float64

    @test ode.u0 isa CuArray
    @test ode.p.solver.basis.derivative_matrix isa CuArray

    @test Trixi.storage_type(ode.p.cache.elements) === CuArray
    @test Trixi.storage_type(ode.p.cache.interfaces) === CuArray
    @test Trixi.storage_type(ode.p.cache.boundaries) === CuArray
    @test Trixi.storage_type(ode.p.cache.mortars) === CuArray
end

From 97e13ec876c4ec3a95c5811b7a3c2eb35f87b9ce Mon Sep 17 00:00:00 2001
From: Benedict Geihe
Date: Wed, 2 Jul 2025 09:34:33 +0200
Subject: [PATCH 37/81] adapt 1D and 3D version

---
 src/solvers/dg.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/solvers/dg.jl b/src/solvers/dg.jl
index 9ec37647c97..a9ed65d7070 100644
--- a/src/solvers/dg.jl
+++ b/src/solvers/dg.jl
@@ -739,8 +739,8 @@ end
-function compute_coefficients!(u, func, t, mesh::AbstractMesh{1}, equations, dg::DG,
-                               cache)
+function compute_coefficients!(backend::Nothing, u, func, t, mesh::AbstractMesh{1},
+                               equations, dg::DG, cache)
     @threaded for element in eachelement(dg, cache)
         for i in eachnode(dg)
             x_node = get_node_coords(cache.elements.node_coordinates, equations, dg, i,
@@ -795,8 +795,8 @@ end
-function compute_coefficients!(u, func, t, mesh::AbstractMesh{3}, equations, dg::DG,
-                               cache)
+function compute_coefficients!(backend::Nothing, u, func, t, mesh::AbstractMesh{3},
+                               equations, dg::DG, cache)
     @threaded for element in eachelement(dg, cache)
         for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
             x_node = get_node_coords(cache.elements.node_coordinates, equations, dg, i,
From 44f7134b3745ed9603a6d59faa1e47b0d65e271b Mon Sep 17 00:00:00 2001
From: Benedict Geihe
Date: Wed, 2 Jul 2025 09:34:49 +0200
Subject: [PATCH 38/81] Raise KernelAbstractions compat lower bound

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 7bea3abf0f9..f4af1d63f45 100644
--- a/Project.toml
+++ b/Project.toml
@@ -83,7 +83,7 @@ EllipsisNotation = "1.0"
 FillArrays = "1.9"
 ForwardDiff = "0.10.36, 1"
 HDF5 = "0.16.10, 0.17"
-KernelAbstractions = "0.9"
+KernelAbstractions = "0.9.15"
 LinearAlgebra = "1"
 LinearMaps = "2.7, 3.0"
 LoopVectorization = "0.12.171"
From abbcc56da5240d828e4cd0093cb530c945d9654b Mon Sep 17 00:00:00 2001
From: Valentin Churavy
Date: Tue, 17 Dec 2024 17:36:16 +0100
Subject: [PATCH 39/81] Use Adapt.jl to change storage and element type

In order to eventually support GPU computation we need to use Adapt.jl
to allow GPU backend packages to swap out host-array types like
`CuArray` with device-side types like `CuDeviceArray`.

Additionally this will allow us to change the element type of a
simulation by using `adapt(Array{Float32}`. 
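
The user-facing entry point is `semidiscretize`; for example

    ode = semidiscretize(semi, tspan; storage_type = CuArray, real_type = Float32)

adapts a semidiscretization to GPU storage and single precision before the
ODE problem is created.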
Co-authored-by: Lars Christmann Co-authored-by: Benedict Geihe --- .buildkite/pipeline.yml | 9 +- .github/workflows/GPUCompat.yml | 86 ----- Project.toml | 5 + docs/make.jl | 3 +- docs/src/heterogeneous.md | 95 +++++ .../p4est_2d_dgsem/elixir_advection_basic.jl | 2 +- .../elixir_advection_basic_gpu.jl | 63 ++++ ext/TrixiCUDAExt.jl | 11 + src/Trixi.jl | 2 + src/auxiliary/containers.jl | 91 +++++ src/auxiliary/vector_of_arrays.jl | 31 ++ src/semidiscretization/semidiscretization.jl | 21 +- .../semidiscretization_hyperbolic.jl | 27 +- src/solvers/dg.jl | 3 + src/solvers/dgsem/basis_lobatto_legendre.jl | 37 ++ src/solvers/dgsem_p4est/containers.jl | 340 ++++++++++++++---- .../dgsem_p4est/containers_parallel.jl | 115 ++++-- src/solvers/dgsem_p4est/dg_3d.jl | 8 +- src/solvers/dgsem_p4est/dg_parallel.jl | 60 ++-- .../sort_boundary_conditions.jl | 17 +- test/Project.toml | 2 + test/runtests.jl | 9 + test/test_aqua.jl | 1 + test/test_cuda.jl | 52 +++ test/test_p4est_2d.jl | 28 ++ test/test_unstructured_2d.jl | 7 + 26 files changed, 882 insertions(+), 243 deletions(-) delete mode 100644 .github/workflows/GPUCompat.yml create mode 100644 docs/src/heterogeneous.md create mode 100644 examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl create mode 100644 ext/TrixiCUDAExt.jl create mode 100644 src/auxiliary/vector_of_arrays.jl create mode 100644 test/test_cuda.jl diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 0f8ad475db8..344b8eacc3a 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,3 +1,5 @@ +env: + steps: - label: "CUDA Julia {{matrix.version}}" matrix: @@ -7,12 +9,13 @@ steps: plugins: - JuliaCI/julia#v1: version: "{{matrix.version}}" - command: | - true + - JuliaCI/julia-test#v1: ~ + env: + TRIXI_TEST: "CUDA" agents: queue: "juliagpu" cuda: "*" if: build.message !~ /\[skip ci\]/ timeout_in_minutes: 60 soft_fail: - - exit_status: 3 \ No newline at end of file + - exit_status: 3 diff --git a/.github/workflows/GPUCompat.yml b/.github/workflows/GPUCompat.yml deleted file mode 100644 index 335e1c83c4c..00000000000 --- a/.github/workflows/GPUCompat.yml +++ /dev/null @@ -1,86 +0,0 @@ -name: GPU Package Compatibility - -on: - pull_request: - paths-ignore: - - 'AUTHORS.md' - - 'CITATION.bib' - - 'CONTRIBUTING.md' - - 'LICENSE.md' - - 'NEWS.md' - - 'README.md' - - '.zenodo.json' - - '.github/workflows/benchmark.yml' - - '.github/workflows/CompatHelper.yml' - - '.github/workflows/TagBot.yml' - - 'benchmark/**' - - 'docs/**' - - 'utils/**' - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - test: - if: "!contains(github.event.head_commit.message, 'skip ci')" - name: ${{ matrix.os }} - ${{ matrix.arch }} - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - include: - - version: '1.10' - os: ubuntu-latest - arch: x64 - - version: '1.10' - os: windows-latest - arch: x64 - # CUDA.jl only supports 64-bit Linux and Windows, see https://github.com/JuliaGPU/CUDA.jl?tab=readme-ov-file#requirements - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Set up Julia - uses: julia-actions/setup-julia@v2 - with: - version: ${{ matrix.version }} - arch: ${{ matrix.arch }} - - - name: Display version info - run: julia -e 'using InteractiveUtils; versioninfo(verbose=true)' - - - name: Cache Julia packages - uses: julia-actions/cache@v2 - - - name: Build project - uses: julia-actions/julia-buildpkg@v1 - - # Only CUDA.jl is needed for GPU compatibility test now - - name: 
Add CUDA.jl to environment
-        run: |
-          julia --project=. -e '
-            using Pkg;
-            Pkg.activate(temp=true);
-            Pkg.develop(PackageSpec(path=pwd()));
-            Pkg.add("CUDA");
-            Pkg.update()'
-
-      # - name: Add Metal.jl to environment
-      #   run: |
-      #     julia --project=. -e '
-      #       using Pkg;
-      #       Pkg.activate(temp=true);
-      #       Pkg.develop(PackageSpec(path=pwd()));
-      #       Pkg.add("Metal");
-      #       Pkg.update()'
-
-      # - name: Add AMDGPU.jl to environment
-      #   run: |
-      #     julia --project=. -e '
-      #       using Pkg;
-      #       Pkg.activate(temp=true);
-      #       Pkg.develop(PackageSpec(path=pwd()));
-      #       Pkg.add("AMDGPU");
-      #       Pkg.update()'
diff --git a/Project.toml b/Project.toml
index 60443f419e7..875d2ae6db1 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,6 +5,7 @@ version = "0.12.6-DEV"
 
 [deps]
 Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697"
+Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 CodeTracking = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2"
 ConstructionBase = "187b0558-2788-49d3-abe0-74a17ed4e7c9"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
@@ -56,14 +57,18 @@
 Convex = "f65535da-76fb-5f13-bab9-19810c17039a"
 ECOS = "e2685f51-7e38-5353-a97d-a921fd2c8199"
 Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"
 NLsolve = "2774e3e8-f4cf-5e23-947b-6d7e65073b56"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 
 [extensions]
 TrixiConvexECOSExt = ["Convex", "ECOS"]
 TrixiMakieExt = "Makie"
 TrixiNLsolveExt = "NLsolve"
+TrixiCUDAExt = "CUDA"
 
 [compat]
 Accessors = "0.1.36"
+Adapt = "4"
+CUDA = "5.8"
 CodeTracking = "1.0.5"
 ConstructionBase = "1.5"
 Convex = "0.16"
diff --git a/docs/make.jl b/docs/make.jl
index 7111b66ab94..0301f5ba64e 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -163,7 +163,8 @@ makedocs(
             "Style guide" => "styleguide.md",
             "Testing" => "testing.md",
             "Performance" => "performance.md",
-            "Parallelization" => "parallelization.md"
+            "Parallelization" => "parallelization.md",
+            "Heterogeneous" => "heterogeneous.md"
         ],
         "Troubleshooting and FAQ" => "troubleshooting.md",
         "Reference" => [
diff --git a/docs/src/heterogeneous.md b/docs/src/heterogeneous.md
new file mode 100644
index 00000000000..b4027abdd3a
--- /dev/null
+++ b/docs/src/heterogeneous.md
@@ -0,0 +1,95 @@
+# Heterogeneous computing
+
+Support for heterogeneous computing is currently being worked on.
+
+## The use of Adapt.jl
+
+[Adapt.jl](https://github.com/JuliaGPU/Adapt.jl) is a package in the
+[JuliaGPU](https://github.com/JuliaGPU) family that allows for
+the translation of nested data structures. The primary goal is to allow the substitution of `Array`
+at the storage leaves with a GPU array like `CuArray` from [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl).
+
+To facilitate this, data structures must be parameterized. So instead of
+
+```julia
+struct Container <: Trixi.AbstractContainer
+    data::Array{Float64, 2}
+end
+```
+
+they must be written as
+
+```julia
+struct Container{D<:AbstractArray} <: Trixi.AbstractContainer
+    data::D
+end
+```
+
+Furthermore, we need to define a function that allows for the conversion of the storage
+of our types:
+
+```julia
+function Adapt.adapt_structure(to, C::Container)
+    return Container(adapt(to, C.data))
+end
+```
+
+or simply
+
+```julia
+Adapt.@adapt_structure(Container)
+```
+
+Additionally, we must define `Adapt.parent_type`:
+
+```julia
+function Adapt.parent_type(::Type{<:Container{D}}) where D
+    return D
+end
+```
+
+Altogether, we can use this machinery to perform conversions of a container.
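+
+In the example below, `Trixi.storage_type` relies on this `Adapt.parent_type`
+definition to recover the leaf array type of a, possibly nested, container by
+unwrapping it. A rough sketch of the mechanism, following the definitions in
+`src/auxiliary/containers.jl`:
+
+```julia
+Adapt.unwrap_type(C::Type{<:Trixi.AbstractContainer}) = Adapt.unwrap_type(Adapt.parent_type(C))
+Trixi.storage_type(C::Type{<:Trixi.AbstractContainer}) = Trixi.storage_type(Adapt.unwrap_type(C))
+Trixi.storage_type(::Type{<:Array}) = Array
+```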
+
+```jldoctest
+julia> import Trixi, Adapt
+
+julia> struct Container{D<:AbstractArray} <: Trixi.AbstractContainer
+           data::D
+       end
+
+julia> Adapt.@adapt_structure(Container)
+
+julia> Adapt.parent_type(::Type{<:Container{D}}) where D = D
+
+julia> C = Container(zeros(3))
+Container{Vector{Float64}}([0.0, 0.0, 0.0])
+
+julia> Trixi.storage_type(C)
+Array
+
+julia> using CUDA
+
+julia> GPU_C = adapt(CuArray, C)
+Container{CuArray{Float64, 1, CUDA.DeviceMemory}}([0.0, 0.0, 0.0])
+
+julia> Trixi.storage_type(GPU_C)
+CuArray
+```
+
+## Element-type conversion with `Trixi.trixi_adapt`
+
+We can use [`Trixi.trixi_adapt`](@ref) to perform both an element-type and a storage-type adaptation:
+
+```julia-repl
+julia> C = Container(zeros(3))
+Container{Vector{Float64}}([0.0, 0.0, 0.0])
+
+julia> Trixi.trixi_adapt(Array, Float32, C)
+Container{Vector{Float32}}(Float32[0.0, 0.0, 0.0])
+
+julia> Trixi.trixi_adapt(CuArray, Float32, C)
+Container{CuArray{Float32, 1, CUDA.DeviceMemory}}(Float32[0.0, 0.0, 0.0])
+```
+
+!!! note
+    `adapt(Array{Float32}, C)` is tempting, but it will do the wrong thing in the presence of `StaticArrays`: since these are `AbstractArray`s themselves, they would be converted to plain `Array{Float32}`s instead of just having their element type changed. `trixi_adapt` instead rewrites their element type via `StaticArrays.similar_type`.
\ No newline at end of file
diff --git a/examples/p4est_2d_dgsem/elixir_advection_basic.jl b/examples/p4est_2d_dgsem/elixir_advection_basic.jl
index 4ff646365aa..e162e8997f2 100644
--- a/examples/p4est_2d_dgsem/elixir_advection_basic.jl
+++ b/examples/p4est_2d_dgsem/elixir_advection_basic.jl
@@ -31,7 +31,7 @@ semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition_convergen
 # ODE solvers, callbacks etc.
 
 # Create ODE problem with time span from 0.0 to 1.0
-ode = semidiscretize(semi, (0.0, 1.0))
+ode = semidiscretize(semi, (0.0, 1.0); real_type = nothing, storage_type = nothing)
 
 # At the beginning of the main loop, the SummaryCallback prints a summary of the simulation setup
 # and resets the timers
diff --git a/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl
new file mode 100644
index 00000000000..4c0f5744a88
--- /dev/null
+++ b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl
@@ -0,0 +1,63 @@
+# The same setup as tree_2d_dgsem/elixir_advection_basic.jl
+# to verify the P4estMesh implementation against TreeMesh
+
+using OrdinaryDiffEqSSPRK, OrdinaryDiffEqLowStorageRK
+using Trixi
+
+###############################################################################
+# semidiscretization of the linear advection equation
+
+advection_velocity = (0.2, -0.7)
+equations = LinearScalarAdvectionEquation2D(advection_velocity)
+
+# Create DG solver with polynomial degree = 3 and (local) Lax-Friedrichs/Rusanov flux as surface flux
+solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs)
+
+coordinates_min = (-1.0, -1.0) # minimum coordinates (min(x), min(y))
+coordinates_max = (1.0, 1.0) # maximum coordinates (max(x), max(y))
+
+trees_per_dimension = (8, 8)
+
+# Create P4estMesh with 8 x 8 trees and 16 x 16 elements
+mesh = P4estMesh(trees_per_dimension, polydeg = 3,
+                 coordinates_min = coordinates_min, coordinates_max = coordinates_max,
+                 initial_refinement_level = 1)
+
+# A semidiscretization collects data structures and functions for the spatial discretization
+semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition_convergence_test,
+                                    solver)
+
+###############################################################################
+# ODE solvers, callbacks etc.
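+
+# Note: `storage_type` and `real_type` are experimental keyword arguments of
+# `semidiscretize`. `storage_type` swaps the underlying array type (e.g. `CuArray`),
+# while `real_type` changes the element type used for computations; passing
+# `nothing` for both keeps the default `Array`/`Float64` setup.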
+ +# Create ODE problem with time span from 0.0 to 1.0 +ode = semidiscretize(semi, (0.0, 1.0); real_type = nothing, storage_type = nothing) + +# At the beginning of the main loop, the SummaryCallback prints a summary of the simulation setup +# and resets the timers +summary_callback = SummaryCallback() + +# The AnalysisCallback allows to analyse the solution in regular intervals and prints the results +analysis_callback = AnalysisCallback(semi, interval = 100) + +# The SaveSolutionCallback allows to save the solution to a file in regular intervals +save_solution = SaveSolutionCallback(interval = 100, + solution_variables = cons2prim) + +# The StepsizeCallback handles the re-calculation of the maximum Δt after each time step +stepsize_callback = StepsizeCallback(cfl = 1.6) + +# Create a CallbackSet to collect all callbacks such that they can be passed to the ODE solver +callbacks = CallbackSet(summary_callback, analysis_callback, save_solution, + stepsize_callback) + +############################################################################### +# run the simulation + +# TODO: Currently we can only construct the ODE problem on the GPU, but we cannot solve it on the GPU yet. +# Uncomment the calls below to discover missing functionality. + +# # OrdinaryDiffEq's `solve` method evolves the solution in time and executes the passed callbacks +# sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false); +# dt = 1.0, # solve needs some value here but it will be overwritten by the stepsize_callback +# ode_default_options()..., callback = callbacks); diff --git a/ext/TrixiCUDAExt.jl b/ext/TrixiCUDAExt.jl new file mode 100644 index 00000000000..681d2f53a1e --- /dev/null +++ b/ext/TrixiCUDAExt.jl @@ -0,0 +1,11 @@ +# Package extension for adding CUDA-based features to Trixi.jl +module TrixiCUDAExt + +import CUDA: CuArray +import Trixi + +function Trixi.storage_type(::Type{<:CuArray}) + return CuArray +end + +end diff --git a/src/Trixi.jl b/src/Trixi.jl index a707437655e..a52dfd6d973 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -50,6 +50,7 @@ import SciMLBase: get_du, get_tmp_cache, u_modified!, using DelimitedFiles: readdlm using Downloads: Downloads +using Adapt: Adapt, adapt using CodeTracking: CodeTracking using ConstructionBase: ConstructionBase using DiffEqBase: DiffEqBase, get_tstops, get_tstops_array @@ -132,6 +133,7 @@ include("basic_types.jl") # Include all top-level source files include("auxiliary/auxiliary.jl") +include("auxiliary/vector_of_arrays.jl") include("auxiliary/mpi.jl") include("auxiliary/p4est.jl") include("auxiliary/t8code.jl") diff --git a/src/auxiliary/containers.jl b/src/auxiliary/containers.jl index 90650f6abcf..edc42db382b 100644 --- a/src/auxiliary/containers.jl +++ b/src/auxiliary/containers.jl @@ -314,4 +314,95 @@ end function raw_copy!(c::AbstractContainer, from::Int, destination::Int) raw_copy!(c, c, from, from, destination) end + +# Trixi storage types must implement these two Adapt.jl methods +function Adapt.adapt_structure(to, c::AbstractContainer) + error("Interface: Must implement Adapt.adapt_structure(to, ::$(typeof(c)))") +end + +function Adapt.parent_type(C::Type{<:AbstractContainer}) + error("Interface: Must implement Adapt.parent_type(::Type{$C}") +end + +function Adapt.unwrap_type(C::Type{<:AbstractContainer}) + return Adapt.unwrap_type(Adapt.parent_type(C)) +end + +# TODO: Upstream to Adapt +function storage_type(x) + return storage_type(typeof(x)) +end + +function storage_type(T::Type) + error("Interface: Must implement 
storage_type(::Type{$T}") +end + +function storage_type(::Type{<:Array}) + Array +end + +function storage_type(C::Type{<:AbstractContainer}) + return storage_type(Adapt.unwrap_type(C)) +end + +# For some storage backends like CUDA.jl, empty arrays do seem to simply be +# null pointers which can cause `unsafe_wrap` to fail when calling +# Adapt.adapt (ArgumentError, see +# https://github.com/JuliaGPU/CUDA.jl/blob/v5.4.2/src/array.jl#L212-L229). +# To circumvent this, on length zero arrays this allocates +# a separate empty array instead of wrapping. +# However, since zero length arrays are not used in calculations, +# it should be okay if the underlying storage vectors and wrapped arrays +# are not the same as long as they are properly wrapped when `resize!`d etc. +function unsafe_wrap_or_alloc(to, vector, size) + if length(vector) == 0 + return similar(vector, size) + else + return unsafe_wrap(to, pointer(vector), size) + end +end + +struct TrixiAdaptor{Storage, Real} end + +function trixi_adapt(storage, real, x) + adapt(TrixiAdaptor{storage, real}(), x) +end + +# Custom rules +# 1. handling of StaticArrays +function Adapt.adapt_storage(::TrixiAdaptor{<:Any, Real}, + x::StaticArrays.StaticArray{S, T, N}) where {Real, S, T, N} + StaticArrays.similar_type(x, Real)(x) +end + +# 2. Handling of Arrays +function Adapt.adapt_storage(::TrixiAdaptor{Storage, Real}, + x::AbstractArray{T}) where {Storage, Real, + T <: AbstractFloat} + adapt(Storage{Real}, x) +end + +function Adapt.adapt_storage(::TrixiAdaptor{Storage, Real}, + x::AbstractArray{T}) where {Storage, Real, + T <: StaticArrays.StaticArray} + adapt(Storage{StaticArrays.similar_type(T, Real)}, x) +end + +# Our threaded cache contains MArray, it is unlikely that we would want to adapt those +function Adapt.adapt_storage(::TrixiAdaptor{Storage, Real}, + x::Array{T}) where {Storage, Real, + T <: StaticArrays.MArray} + adapt(Array{StaticArrays.similar_type(T, Real)}, x) +end + +function Adapt.adapt_storage(::TrixiAdaptor{Storage, Real}, + x::AbstractArray) where {Storage, Real} + adapt(Storage, x) +end + +# 3. TODO: Should we have a fallback? But that would imply implementing things for NamedTuple again + +function unsafe_wrap_or_alloc(::TrixiAdaptor{Storage}, vec, size) where {Storage} + return unsafe_wrap_or_alloc(Storage, vec, size) +end end # @muladd diff --git a/src/auxiliary/vector_of_arrays.jl b/src/auxiliary/vector_of_arrays.jl new file mode 100644 index 00000000000..0fa8dd7f1ec --- /dev/null +++ b/src/auxiliary/vector_of_arrays.jl @@ -0,0 +1,31 @@ +# By default, Julia/LLVM does not use fused multiply-add operations (FMAs). +# Since these FMAs can increase the performance of many numerical algorithms, +# we need to opt-in explicitly. +# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details. +@muladd begin +#! format: noindent + +# Wraps a Vector of Arrays, forwards `getindex` to the underlying Vector. +# Implements `Adapt.adapt_structure` to allow offloading to the GPU which is +# not possible for a plain Vector of Arrays. 
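+#
+# A usage sketch (assuming a functional CUDA.jl setup):
+#     v = VecOfArrays([zeros(2), zeros(3)])
+#     adapt(CuArray, v)  # -> VecOfArrays whose entries are CuArrays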
+struct VecOfArrays{T <: AbstractArray}
+    arrays::Vector{T}
+end
+Base.getindex(v::VecOfArrays, i::Int) = Base.getindex(v.arrays, i)
+Base.IndexStyle(v::VecOfArrays) = Base.IndexStyle(v.arrays)
+Base.size(v::VecOfArrays) = Base.size(v.arrays)
+Base.length(v::VecOfArrays) = Base.length(v.arrays)
+Base.eltype(v::VecOfArrays{T}) where {T} = T
+function Adapt.adapt_structure(to, v::VecOfArrays)
+    return VecOfArrays([Adapt.adapt(to, arr) for arr in v.arrays])
+end
+function Adapt.parent_type(::Type{<:VecOfArrays{T}}) where {T}
+    return T
+end
+function Adapt.unwrap_type(A::Type{<:VecOfArrays})
+    Adapt.unwrap_type(Adapt.parent_type(A))
+end
+function Base.convert(::Type{<:VecOfArrays}, v::Vector{<:AbstractArray})
+    VecOfArrays(v)
+end
+end # @muladd
diff --git a/src/semidiscretization/semidiscretization.jl b/src/semidiscretization/semidiscretization.jl
index cc3900d42da..97c50aa46a1 100644
--- a/src/semidiscretization/semidiscretization.jl
+++ b/src/semidiscretization/semidiscretization.jl
@@ -82,9 +82,15 @@ end
 Wrap the semidiscretization `semi` as an ODE problem in the time interval `tspan`
 that can be passed to `solve` from the [SciML ecosystem](https://diffeq.sciml.ai/latest/).
+
+The optional keyword arguments `storage_type` and `real_type` configure the underlying computational
+data structures. `storage_type` changes the fundamental array type being used, allowing the
+experimental use of `CuArray` or other GPU array types. `real_type` changes the element type used for computations.
 """
 function semidiscretize(semi::AbstractSemidiscretization, tspan;
-                        reset_threads = true)
+                        reset_threads = true,
+                        storage_type = nothing,
+                        real_type = nothing)
     # Optionally reset Polyester.jl threads. See
     # https://github.com/trixi-framework/Trixi.jl/issues/1583
     # https://github.com/JuliaSIMD/Polyester.jl/issues/30
@@ -92,6 +98,19 @@ function semidiscretize(semi::AbstractSemidiscretization, tspan;
         Polyester.reset_threads!()
     end
 
+    if !(storage_type === nothing && real_type === nothing)
+        if storage_type === nothing
+            storage_type = Array
+        end
+        if real_type === nothing
+            real_type = Float64
+        end
+        semi = trixi_adapt(storage_type, real_type, semi)
+        if eltype(tspan) !== real_type
+            tspan = convert.(real_type, tspan)
+        end
+    end
+
     u0_ode = compute_coefficients(first(tspan), semi)
     # TODO: MPI, do we want to synchronize loading and print debug statements, e.g. 
using # mpi_isparallel() && MPI.Barrier(mpi_comm()) diff --git a/src/semidiscretization/semidiscretization_hyperbolic.jl b/src/semidiscretization/semidiscretization_hyperbolic.jl index 7496a345661..2a563c02229 100644 --- a/src/semidiscretization/semidiscretization_hyperbolic.jl +++ b/src/semidiscretization/semidiscretization_hyperbolic.jl @@ -27,25 +27,6 @@ mutable struct SemidiscretizationHyperbolic{Mesh, Equations, InitialCondition, solver::Solver cache::Cache performance_counter::PerformanceCounter - - function SemidiscretizationHyperbolic{Mesh, Equations, InitialCondition, - BoundaryConditions, SourceTerms, Solver, - Cache}(mesh::Mesh, equations::Equations, - initial_condition::InitialCondition, - boundary_conditions::BoundaryConditions, - source_terms::SourceTerms, - solver::Solver, - cache::Cache) where {Mesh, Equations, - InitialCondition, - BoundaryConditions, - SourceTerms, - Solver, - Cache} - performance_counter = PerformanceCounter() - - new(mesh, equations, initial_condition, boundary_conditions, source_terms, - solver, cache, performance_counter) - end end """ @@ -71,6 +52,8 @@ function SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver check_periodicity_mesh_boundary_conditions(mesh, _boundary_conditions) + performance_counter = PerformanceCounter() + SemidiscretizationHyperbolic{typeof(mesh), typeof(equations), typeof(initial_condition), typeof(_boundary_conditions), typeof(source_terms), @@ -78,9 +61,13 @@ function SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver initial_condition, _boundary_conditions, source_terms, solver, - cache) + cache, + performance_counter) end +# @eval due to @muladd +@eval Adapt.@adapt_structure(SemidiscretizationHyperbolic) + # Create a new semidiscretization but change some parameters compared to the input. # `Base.similar` follows a related concept but would require us to `copy` the `mesh`, # which would impact the performance. 
Instead, `SciMLBase.remake` has exactly the diff --git a/src/solvers/dg.jl b/src/solvers/dg.jl index ad211b3c003..78f3901a346 100644 --- a/src/solvers/dg.jl +++ b/src/solvers/dg.jl @@ -415,6 +415,9 @@ struct DG{Basis, Mortar, SurfaceIntegral, VolumeIntegral} volume_integral::VolumeIntegral end +# @eval due to @muladd +@eval Adapt.@adapt_structure(DG) + function Base.show(io::IO, dg::DG) @nospecialize dg # reduce precompilation time diff --git a/src/solvers/dgsem/basis_lobatto_legendre.jl b/src/solvers/dgsem/basis_lobatto_legendre.jl index 777348aa8ce..9647f172e20 100644 --- a/src/solvers/dgsem/basis_lobatto_legendre.jl +++ b/src/solvers/dgsem/basis_lobatto_legendre.jl @@ -34,6 +34,32 @@ struct LobattoLegendreBasis{RealT <: Real, NNODES, # negative adjoint wrt the SBP dot product end +function Adapt.adapt_structure(to, basis::LobattoLegendreBasis) + inverse_vandermonde_legendre = adapt(to, basis.inverse_vandermonde_legendre) + RealT = eltype(inverse_vandermonde_legendre) + + nodes = SVector{<:Any, RealT}(basis.nodes) + weights = SVector{<:Any, RealT}(basis.weights) + inverse_weights = SVector{<:Any, RealT}(basis.inverse_weights) + boundary_interpolation = adapt(to, basis.boundary_interpolation) + derivative_matrix = adapt(to, basis.derivative_matrix) + derivative_split = adapt(to, basis.derivative_split) + derivative_split_transpose = adapt(to, basis.derivative_split_transpose) + derivative_dhat = adapt(to, basis.derivative_dhat) + return LobattoLegendreBasis{RealT, nnodes(basis), typeof(nodes), + typeof(inverse_vandermonde_legendre), + typeof(boundary_interpolation), + typeof(derivative_matrix)}(nodes, + weights, + inverse_weights, + inverse_vandermonde_legendre, + boundary_interpolation, + derivative_matrix, + derivative_split, + derivative_split_transpose, + derivative_dhat) +end + function LobattoLegendreBasis(RealT, polydeg::Integer) nnodes_ = polydeg + 1 @@ -155,6 +181,17 @@ struct LobattoLegendreMortarL2{RealT <: Real, NNODES, reverse_lower::ReverseMatrix end +function Adapt.adapt_structure(to, mortar::LobattoLegendreMortarL2) + forward_upper = adapt(to, mortar.forward_upper) + forward_lower = adapt(to, mortar.forward_lower) + reverse_upper = adapt(to, mortar.reverse_upper) + reverse_lower = adapt(to, mortar.reverse_lower) + return LobattoLegendreMortarL2{eltype(forward_upper), nnodes(mortar), + typeof(forward_upper), + typeof(reverse_upper)}(forward_upper, forward_lower, + reverse_upper, reverse_lower) +end + function MortarL2(basis::LobattoLegendreBasis) RealT = real(basis) nnodes_ = nnodes(basis) diff --git a/src/solvers/dgsem_p4est/containers.jl b/src/solvers/dgsem_p4est/containers.jl index a070db6b701..83097f4a1ed 100644 --- a/src/solvers/dgsem_p4est/containers.jl +++ b/src/solvers/dgsem_p4est/containers.jl @@ -6,25 +6,31 @@ #! 
format: noindent mutable struct P4estElementContainer{NDIMS, RealT <: Real, uEltype <: Real, NDIMSP1, - NDIMSP2, NDIMSP3} <: AbstractContainer + NDIMSP2, NDIMSP3, + ArrayNDIMSP1 <: DenseArray{RealT, NDIMSP1}, + ArrayNDIMSP2 <: DenseArray{RealT, NDIMSP2}, + ArrayNDIMSP3 <: DenseArray{RealT, NDIMSP3}, + VectorRealT <: DenseVector{RealT}, + VectoruEltype <: DenseVector{uEltype}} <: + AbstractContainer # Physical coordinates at each node - node_coordinates::Array{RealT, NDIMSP2} # [orientation, node_i, node_j, node_k, element] + node_coordinates::ArrayNDIMSP2 # [orientation, node_i, node_j, node_k, element] # Jacobian matrix of the transformation # [jacobian_i, jacobian_j, node_i, node_j, node_k, element] where jacobian_i is the first index of the Jacobian matrix,... - jacobian_matrix::Array{RealT, NDIMSP3} + jacobian_matrix::ArrayNDIMSP3 # Contravariant vectors, scaled by J, in Kopriva's blue book called Ja^i_n (i index, n dimension) - contravariant_vectors::Array{RealT, NDIMSP3} # [dimension, index, node_i, node_j, node_k, element] + contravariant_vectors::ArrayNDIMSP3 # [dimension, index, node_i, node_j, node_k, element] # 1/J where J is the Jacobian determinant (determinant of Jacobian matrix) - inverse_jacobian::Array{RealT, NDIMSP1} # [node_i, node_j, node_k, element] + inverse_jacobian::ArrayNDIMSP1 # [node_i, node_j, node_k, element] # Buffer for calculated surface flux - surface_flux_values::Array{uEltype, NDIMSP2} # [variable, i, j, direction, element] + surface_flux_values::ArrayNDIMSP2 # [variable, i, j, direction, element] # internal `resize!`able storage - _node_coordinates::Vector{RealT} - _jacobian_matrix::Vector{RealT} - _contravariant_vectors::Vector{RealT} - _inverse_jacobian::Vector{RealT} - _surface_flux_values::Vector{uEltype} + _node_coordinates::VectorRealT + _jacobian_matrix::VectorRealT + _contravariant_vectors::VectorRealT + _inverse_jacobian::VectorRealT + _surface_flux_values::VectoruEltype end @inline function nelements(elements::P4estElementContainer) @@ -36,7 +42,7 @@ end RealT, uEltype } - uEltype + return uEltype end # Only one-dimensional `Array`s are `resize!`able in Julia. 
@@ -51,31 +57,41 @@ function Base.resize!(elements::P4estElementContainer, capacity) n_dims = ndims(elements) n_nodes = size(elements.node_coordinates, 2) n_variables = size(elements.surface_flux_values, 1) + ArrayType = storage_type(elements) resize!(_node_coordinates, n_dims * n_nodes^n_dims * capacity) - elements.node_coordinates = unsafe_wrap(Array, pointer(_node_coordinates), - (n_dims, ntuple(_ -> n_nodes, n_dims)..., - capacity)) + elements.node_coordinates = unsafe_wrap_or_alloc(ArrayType, + _node_coordinates, + (n_dims, + ntuple(_ -> n_nodes, n_dims)..., + capacity)) resize!(_jacobian_matrix, n_dims^2 * n_nodes^n_dims * capacity) - elements.jacobian_matrix = unsafe_wrap(Array, pointer(_jacobian_matrix), - (n_dims, n_dims, - ntuple(_ -> n_nodes, n_dims)..., capacity)) + elements.jacobian_matrix = unsafe_wrap_or_alloc(ArrayType, + _jacobian_matrix, + (n_dims, n_dims, + ntuple(_ -> n_nodes, n_dims)..., + capacity)) resize!(_contravariant_vectors, length(_jacobian_matrix)) - elements.contravariant_vectors = unsafe_wrap(Array, pointer(_contravariant_vectors), - size(elements.jacobian_matrix)) + elements.contravariant_vectors = unsafe_wrap_or_alloc(ArrayType, + _contravariant_vectors, + size(elements.jacobian_matrix)) resize!(_inverse_jacobian, n_nodes^n_dims * capacity) - elements.inverse_jacobian = unsafe_wrap(Array, pointer(_inverse_jacobian), - (ntuple(_ -> n_nodes, n_dims)..., capacity)) + elements.inverse_jacobian = unsafe_wrap_or_alloc(ArrayType, + _inverse_jacobian, + (ntuple(_ -> n_nodes, n_dims)..., + capacity)) resize!(_surface_flux_values, n_variables * n_nodes^(n_dims - 1) * (n_dims * 2) * capacity) - elements.surface_flux_values = unsafe_wrap(Array, pointer(_surface_flux_values), - (n_variables, - ntuple(_ -> n_nodes, n_dims - 1)..., - n_dims * 2, capacity)) + elements.surface_flux_values = unsafe_wrap_or_alloc(ArrayType, + _surface_flux_values, + (n_variables, + ntuple(_ -> n_nodes, + n_dims - 1)..., + n_dims * 2, capacity)) return nothing end @@ -117,33 +133,104 @@ function init_elements(mesh::Union{P4estMesh{NDIMS, NDIMS, RealT}, NDIMS * 2, nelements)) elements = P4estElementContainer{NDIMS, RealT, uEltype, NDIMS + 1, NDIMS + 2, - NDIMS + 3}(node_coordinates, jacobian_matrix, - contravariant_vectors, - inverse_jacobian, surface_flux_values, - _node_coordinates, _jacobian_matrix, - _contravariant_vectors, - _inverse_jacobian, _surface_flux_values) + NDIMS + 3, Array{RealT, NDIMS + 1}, + Array{RealT, NDIMS + 2}, Array{RealT, NDIMS + 3}, + Vector{RealT}, Vector{uEltype}}(node_coordinates, + jacobian_matrix, + contravariant_vectors, + inverse_jacobian, + surface_flux_values, + _node_coordinates, + _jacobian_matrix, + _contravariant_vectors, + _inverse_jacobian, + _surface_flux_values) init_elements!(elements, mesh, basis) return elements end -mutable struct P4estInterfaceContainer{NDIMS, uEltype <: Real, NDIMSP2} <: +function Adapt.parent_type(::Type{<:P4estElementContainer{<:Any, <:Any, <:Any, <:Any, + <:Any, <:Any, ArrayT}}) where {ArrayT} + ArrayT +end + +# Manual adapt_structure since we have aliasing memory +function Adapt.adapt_structure(to, + elements::P4estElementContainer{NDIMS}) where {NDIMS} + # Adapt underlying storage + _node_coordinates = adapt(to, elements._node_coordinates) + _jacobian_matrix = adapt(to, elements._jacobian_matrix) + _contravariant_vectors = adapt(to, elements._contravariant_vectors) + _inverse_jacobian = adapt(to, elements._inverse_jacobian) + _surface_flux_values = adapt(to, elements._surface_flux_values) + + RealT = 
eltype(_inverse_jacobian) + uEltype = eltype(_surface_flux_values) + + # Wrap arrays again + node_coordinates = unsafe_wrap_or_alloc(to, _node_coordinates, + size(elements.node_coordinates)) + jacobian_matrix = unsafe_wrap_or_alloc(to, _jacobian_matrix, + size(elements.jacobian_matrix)) + contravariant_vectors = unsafe_wrap_or_alloc(to, _contravariant_vectors, + size(jacobian_matrix)) + inverse_jacobian = unsafe_wrap_or_alloc(to, _inverse_jacobian, + size(elements.inverse_jacobian)) + surface_flux_values = unsafe_wrap_or_alloc(to, _surface_flux_values, + size(elements.surface_flux_values)) + + new_type_params = (NDIMS, + RealT, + uEltype, + NDIMS + 1, + NDIMS + 2, + NDIMS + 3, + typeof(inverse_jacobian), # ArrayNDIMSP1 + typeof(node_coordinates), # ArrayNDIMSP2 + typeof(jacobian_matrix), # ArrayNDIMSP3 + typeof(_node_coordinates), # VectorRealT + typeof(_surface_flux_values)) # VectoruEltype + return P4estElementContainer{new_type_params...}(node_coordinates, + jacobian_matrix, + contravariant_vectors, + inverse_jacobian, + surface_flux_values, + _node_coordinates, + _jacobian_matrix, + _contravariant_vectors, + _inverse_jacobian, + _surface_flux_values) +end + +mutable struct P4estInterfaceContainer{NDIMS, uEltype <: Real, NDIMSP2, + uArray <: DenseArray{uEltype, NDIMSP2}, + IdsMatrix <: DenseMatrix{Int}, + IndicesMatrix <: + DenseMatrix{NTuple{NDIMS, Symbol}}, + uVector <: DenseVector{uEltype}, + IdsVector <: DenseVector{Int}, + IndicesVector <: + DenseVector{NTuple{NDIMS, Symbol}}} <: AbstractContainer - u::Array{uEltype, NDIMSP2} # [primary/secondary, variable, i, j, interface] - neighbor_ids::Matrix{Int} # [primary/secondary, interface] - node_indices::Matrix{NTuple{NDIMS, Symbol}} # [primary/secondary, interface] + u::uArray # [primary/secondary, variable, i, j, interface] + neighbor_ids::IdsMatrix # [primary/secondary, interface] + node_indices::IndicesMatrix # [primary/secondary, interface] # internal `resize!`able storage - _u::Vector{uEltype} - _neighbor_ids::Vector{Int} - _node_indices::Vector{NTuple{NDIMS, Symbol}} + _u::uVector + _neighbor_ids::IdsVector + _node_indices::IndicesVector end @inline function ninterfaces(interfaces::P4estInterfaceContainer) size(interfaces.neighbor_ids, 2) end @inline Base.ndims(::P4estInterfaceContainer{NDIMS}) where {NDIMS} = NDIMS +@inline function Base.eltype(::P4estInterfaceContainer{NDIMS, uEltype}) where {NDIMS, + uEltype} + return uEltype +end # See explanation of Base.resize! 
for the element container function Base.resize!(interfaces::P4estInterfaceContainer, capacity) @@ -152,17 +239,20 @@ function Base.resize!(interfaces::P4estInterfaceContainer, capacity) n_dims = ndims(interfaces) n_nodes = size(interfaces.u, 3) n_variables = size(interfaces.u, 2) + ArrayType = storage_type(interfaces) resize!(_u, 2 * n_variables * n_nodes^(n_dims - 1) * capacity) - interfaces.u = unsafe_wrap(Array, pointer(_u), + interfaces.u = unsafe_wrap(ArrayType, pointer(_u), (2, n_variables, ntuple(_ -> n_nodes, n_dims - 1)..., capacity)) resize!(_neighbor_ids, 2 * capacity) - interfaces.neighbor_ids = unsafe_wrap(Array, pointer(_neighbor_ids), (2, capacity)) + interfaces.neighbor_ids = unsafe_wrap(ArrayType, pointer(_neighbor_ids), + (2, capacity)) resize!(_node_indices, 2 * capacity) - interfaces.node_indices = unsafe_wrap(Array, pointer(_node_indices), (2, capacity)) + interfaces.node_indices = unsafe_wrap(ArrayType, pointer(_node_indices), + (2, capacity)) return nothing end @@ -189,10 +279,15 @@ function init_interfaces(mesh::Union{P4estMesh, P4estMeshView, T8codeMesh}, equa _node_indices = Vector{NTuple{NDIMS, Symbol}}(undef, 2 * n_interfaces) node_indices = unsafe_wrap(Array, pointer(_node_indices), (2, n_interfaces)) - interfaces = P4estInterfaceContainer{NDIMS, uEltype, NDIMS + 2}(u, neighbor_ids, - node_indices, - _u, _neighbor_ids, - _node_indices) + interfaces = P4estInterfaceContainer{NDIMS, uEltype, NDIMS + 2, + typeof(u), typeof(neighbor_ids), + typeof(node_indices), typeof(_u), + typeof(_neighbor_ids), typeof(_node_indices)}(u, + neighbor_ids, + node_indices, + _u, + _neighbor_ids, + _node_indices) init_interfaces!(interfaces, mesh) @@ -205,21 +300,58 @@ function init_interfaces!(interfaces, mesh::Union{P4estMesh, P4estMeshView}) return interfaces end -mutable struct P4estBoundaryContainer{NDIMS, uEltype <: Real, NDIMSP1} <: +function Adapt.parent_type(::Type{<:P4estInterfaceContainer{<:Any, <:Any, <:Any, + ArrayT}}) where {ArrayT} + ArrayT +end + +# Manual adapt_structure since we have aliasing memory +function Adapt.adapt_structure(to, interfaces::P4estInterfaceContainer) + # Adapt underlying storage + _u = adapt(to, interfaces._u) + _neighbor_ids = adapt(to, interfaces._neighbor_ids) + _node_indices = adapt(to, interfaces._node_indices) + # Wrap arrays again + u = unsafe_wrap_or_alloc(to, _u, size(interfaces.u)) + neighbor_ids = unsafe_wrap_or_alloc(to, _neighbor_ids, + size(interfaces.neighbor_ids)) + node_indices = unsafe_wrap_or_alloc(to, _node_indices, + size(interfaces.node_indices)) + + NDIMS = ndims(interfaces) + new_type_params = (NDIMS, + eltype(_u), + NDIMS + 2, + typeof(u), typeof(neighbor_ids), typeof(node_indices), + typeof(_u), typeof(_neighbor_ids), typeof(_node_indices)) + return P4estInterfaceContainer{new_type_params...}(u, neighbor_ids, node_indices, + _u, _neighbor_ids, _node_indices) +end + +mutable struct P4estBoundaryContainer{NDIMS, uEltype <: Real, NDIMSP1, + uArray <: DenseArray{uEltype, NDIMSP1}, + IdsVector <: DenseVector{Int}, + IndicesVector <: + DenseVector{NTuple{NDIMS, Symbol}}, + uVector <: DenseVector{uEltype}} <: AbstractContainer - u::Array{uEltype, NDIMSP1} # [variables, i, j, boundary] - neighbor_ids::Vector{Int} # [boundary] - node_indices::Vector{NTuple{NDIMS, Symbol}} # [boundary] + u::uArray # [variables, i, j, boundary] + neighbor_ids::IdsVector # [boundary] + node_indices::IndicesVector # [boundary] name::Vector{Symbol} # [boundary] # internal `resize!`able storage - _u::Vector{uEltype} + _u::uVector end @inline function 
nboundaries(boundaries::P4estBoundaryContainer) length(boundaries.neighbor_ids) end @inline Base.ndims(::P4estBoundaryContainer{NDIMS}) where {NDIMS} = NDIMS +@inline function Base.eltype(::P4estBoundaryContainer{NDIMS, uEltype}) where {NDIMS, + uEltype} + uEltype +end # See explanation of Base.resize! for the element container function Base.resize!(boundaries::P4estBoundaryContainer, capacity) @@ -228,9 +360,10 @@ function Base.resize!(boundaries::P4estBoundaryContainer, capacity) n_dims = ndims(boundaries) n_nodes = size(boundaries.u, 2) n_variables = size(boundaries.u, 1) + ArrayType = storage_type(boundaries) resize!(_u, n_variables * n_nodes^(n_dims - 1) * capacity) - boundaries.u = unsafe_wrap(Array, pointer(_u), + boundaries.u = unsafe_wrap(ArrayType, pointer(_u), (n_variables, ntuple(_ -> n_nodes, n_dims - 1)..., capacity)) @@ -263,9 +396,11 @@ function init_boundaries(mesh::Union{P4estMesh, P4estMeshView, T8codeMesh}, equa node_indices = Vector{NTuple{NDIMS, Symbol}}(undef, n_boundaries) names = Vector{Symbol}(undef, n_boundaries) - boundaries = P4estBoundaryContainer{NDIMS, uEltype, NDIMS + 1}(u, neighbor_ids, - node_indices, names, - _u) + boundaries = P4estBoundaryContainer{NDIMS, uEltype, NDIMS + 1, typeof(u), + typeof(neighbor_ids), typeof(node_indices), + typeof(_u)}(u, neighbor_ids, + node_indices, names, + _u) if n_boundaries > 0 init_boundaries!(boundaries, mesh) @@ -312,6 +447,25 @@ function init_boundaries_iter_face_inner(info_pw, boundaries, boundary_id, mesh) return nothing end +function Adapt.parent_type(::Type{<:P4estBoundaryContainer{<:Any, <:Any, <:Any, ArrayT}}) where {ArrayT} + ArrayT +end + +# Manual adapt_structure since we have aliasing memory +function Adapt.adapt_structure(to, boundaries::P4estBoundaryContainer) + _u = adapt(to, boundaries._u) + u = unsafe_wrap_or_alloc(to, _u, size(boundaries.u)) + neighbor_ids = adapt(to, boundaries.neighbor_ids) + node_indices = adapt(to, boundaries.node_indices) + name = boundaries.name + + NDIMS = ndims(boundaries) + return P4estBoundaryContainer{NDIMS, eltype(_u), NDIMS + 1, typeof(u), + typeof(neighbor_ids), typeof(node_indices), + typeof(_u)}(u, neighbor_ids, node_indices, + name, _u) +end + # Container data structure (structure-of-arrays style) for DG L2 mortars # # The positions used in `neighbor_ids` are 1:3 (in 2D) or 1:5 (in 3D), where 1:2 (in 2D) @@ -337,20 +491,32 @@ end # │ └─────────────┴─────────────┘ └───────────────────────────┘ # │ # ⋅────> ξ -mutable struct P4estMortarContainer{NDIMS, uEltype <: Real, NDIMSP1, NDIMSP3} <: +mutable struct P4estMortarContainer{NDIMS, uEltype <: Real, NDIMSP1, NDIMSP3, + uArray <: DenseArray{uEltype, NDIMSP3}, + IdsMatrix <: DenseMatrix{Int}, + IndicesMatrix <: + DenseMatrix{NTuple{NDIMS, Symbol}}, + uVector <: DenseVector{uEltype}, + IdsVector <: DenseVector{Int}, + IndicesVector <: + DenseVector{NTuple{NDIMS, Symbol}}} <: AbstractContainer - u::Array{uEltype, NDIMSP3} # [small/large side, variable, position, i, j, mortar] - neighbor_ids::Matrix{Int} # [position, mortar] - node_indices::Matrix{NTuple{NDIMS, Symbol}} # [small/large, mortar] + u::uArray # [small/large side, variable, position, i, j, mortar] + neighbor_ids::IdsMatrix # [position, mortar] + node_indices::IndicesMatrix # [small/large, mortar] # internal `resize!`able storage - _u::Vector{uEltype} - _neighbor_ids::Vector{Int} - _node_indices::Vector{NTuple{NDIMS, Symbol}} + _u::uVector + _neighbor_ids::IdsVector + _node_indices::IndicesVector end @inline nmortars(mortars::P4estMortarContainer) = 
size(mortars.neighbor_ids, 2) @inline Base.ndims(::P4estMortarContainer{NDIMS}) where {NDIMS} = NDIMS +@inline function Base.eltype(::P4estMortarContainer{NDIMS, uEltype}) where {NDIMS, + uEltype} + uEltype +end # See explanation of Base.resize! for the element container function Base.resize!(mortars::P4estMortarContainer, capacity) @@ -359,18 +525,19 @@ function Base.resize!(mortars::P4estMortarContainer, capacity) n_dims = ndims(mortars) n_nodes = size(mortars.u, 4) n_variables = size(mortars.u, 2) + ArrayType = storage_type(mortars) resize!(_u, 2 * n_variables * 2^(n_dims - 1) * n_nodes^(n_dims - 1) * capacity) - mortars.u = unsafe_wrap(Array, pointer(_u), + mortars.u = unsafe_wrap(ArrayType, pointer(_u), (2, n_variables, 2^(n_dims - 1), ntuple(_ -> n_nodes, n_dims - 1)..., capacity)) resize!(_neighbor_ids, (2^(n_dims - 1) + 1) * capacity) - mortars.neighbor_ids = unsafe_wrap(Array, pointer(_neighbor_ids), + mortars.neighbor_ids = unsafe_wrap(ArrayType, pointer(_neighbor_ids), (2^(n_dims - 1) + 1, capacity)) resize!(_node_indices, 2 * capacity) - mortars.node_indices = unsafe_wrap(Array, pointer(_node_indices), (2, capacity)) + mortars.node_indices = unsafe_wrap(ArrayType, pointer(_node_indices), (2, capacity)) return nothing end @@ -398,12 +565,15 @@ function init_mortars(mesh::Union{P4estMesh, P4estMeshView, T8codeMesh}, equatio _node_indices = Vector{NTuple{NDIMS, Symbol}}(undef, 2 * n_mortars) node_indices = unsafe_wrap(Array, pointer(_node_indices), (2, n_mortars)) - mortars = P4estMortarContainer{NDIMS, uEltype, NDIMS + 1, NDIMS + 3}(u, - neighbor_ids, - node_indices, - _u, - _neighbor_ids, - _node_indices) + mortars = P4estMortarContainer{NDIMS, uEltype, NDIMS + 1, NDIMS + 3, typeof(u), + typeof(neighbor_ids), typeof(node_indices), + typeof(_u), typeof(_neighbor_ids), + typeof(_node_indices)}(u, + neighbor_ids, + node_indices, + _u, + _neighbor_ids, + _node_indices) if n_mortars > 0 init_mortars!(mortars, mesh) @@ -418,6 +588,34 @@ function init_mortars!(mortars, mesh::Union{P4estMesh, P4estMeshView}) return mortars end +function Adapt.parent_type(::Type{<:P4estMortarContainer{<:Any, <:Any, <:Any, <:Any, + ArrayT}}) where {ArrayT} + ArrayT +end + +# Manual adapt_structure since we have aliasing memory +function Adapt.adapt_structure(to, mortars::P4estMortarContainer) + # Adapt underlying storage + _u = adapt(to, mortars._u) + _neighbor_ids = adapt(to, mortars._neighbor_ids) + _node_indices = adapt(to, mortars._node_indices) + + # Wrap arrays again + u = unsafe_wrap_or_alloc(to, _u, size(mortars.u)) + neighbor_ids = unsafe_wrap_or_alloc(to, _neighbor_ids, size(mortars.neighbor_ids)) + node_indices = unsafe_wrap_or_alloc(to, _node_indices, size(mortars.node_indices)) + + NDIMS = ndims(mortars) + new_type_params = (NDIMS, + eltype(_u), + NDIMS + 1, + NDIMS + 3, + typeof(u), typeof(neighbor_ids), typeof(node_indices), + typeof(_u), typeof(_neighbor_ids), typeof(_node_indices)) + return P4estMortarContainer{new_type_params...}(u, neighbor_ids, node_indices, + _u, _neighbor_ids, _node_indices) +end + function reinitialize_containers!(mesh::P4estMesh, equations, dg::DGSEM, cache) # Re-initialize elements container @unpack elements = cache diff --git a/src/solvers/dgsem_p4est/containers_parallel.jl b/src/solvers/dgsem_p4est/containers_parallel.jl index 676b37efff3..123337d8c0a 100644 --- a/src/solvers/dgsem_p4est/containers_parallel.jl +++ b/src/solvers/dgsem_p4est/containers_parallel.jl @@ -5,15 +5,19 @@ @muladd begin #! 
format: noindent -mutable struct P4estMPIInterfaceContainer{NDIMS, uEltype <: Real, NDIMSP2} <: +mutable struct P4estMPIInterfaceContainer{NDIMS, uEltype <: Real, NDIMSP2, + uArray <: DenseArray{uEltype, NDIMSP2}, + VecInt <: DenseVector{Int}, + IndicesVector <: + DenseVector{NTuple{NDIMS, Symbol}}, + uVector <: DenseVector{uEltype}} <: AbstractContainer - u::Array{uEltype, NDIMSP2} # [primary/secondary, variable, i, j, interface] - local_neighbor_ids::Vector{Int} # [interface] - node_indices::Vector{NTuple{NDIMS, Symbol}} # [interface] - local_sides::Vector{Int} # [interface] - + u::uArray # [primary/secondary, variable, i, j, interface] + local_neighbor_ids::VecInt # [interface] + node_indices::IndicesVector # [interface] + local_sides::VecInt # [interface] # internal `resize!`able storage - _u::Vector{uEltype} + _u::uVector end @inline function nmpiinterfaces(interfaces::P4estMPIInterfaceContainer) @@ -27,9 +31,10 @@ function Base.resize!(mpi_interfaces::P4estMPIInterfaceContainer, capacity) n_dims = ndims(mpi_interfaces) n_nodes = size(mpi_interfaces.u, 3) n_variables = size(mpi_interfaces.u, 2) + ArrayType = storage_type(mpi_interfaces) resize!(_u, 2 * n_variables * n_nodes^(n_dims - 1) * capacity) - mpi_interfaces.u = unsafe_wrap(Array, pointer(_u), + mpi_interfaces.u = unsafe_wrap(ArrayType, pointer(_u), (2, n_variables, ntuple(_ -> n_nodes, n_dims - 1)..., capacity)) @@ -64,11 +69,13 @@ function init_mpi_interfaces(mesh::Union{ParallelP4estMesh, ParallelT8codeMesh}, local_sides = Vector{Int}(undef, n_mpi_interfaces) - mpi_interfaces = P4estMPIInterfaceContainer{NDIMS, uEltype, NDIMS + 2}(u, - local_neighbor_ids, - node_indices, - local_sides, - _u) + mpi_interfaces = P4estMPIInterfaceContainer{NDIMS, uEltype, NDIMS + 2, + typeof(u), typeof(local_neighbor_ids), + typeof(node_indices), typeof(_u)}(u, + local_neighbor_ids, + node_indices, + local_sides, + _u) init_mpi_interfaces!(mpi_interfaces, mesh) @@ -81,6 +88,32 @@ function init_mpi_interfaces!(mpi_interfaces, mesh::ParallelP4estMesh) return mpi_interfaces end +function Adapt.parent_type(::Type{<:Trixi.P4estMPIInterfaceContainer{<:Any, <:Any, + <:Any, A}}) where {A} + return A +end + +# Manual adapt_structure since we have aliasing memory +function Adapt.adapt_structure(to, mpi_interfaces::P4estMPIInterfaceContainer) + # Adapt Vectors and underlying storage + _u = adapt(to, mpi_interfaces._u) + local_neighbor_ids = adapt(to, mpi_interfaces.local_neighbor_ids) + node_indices = adapt(to, mpi_interfaces.node_indices) + local_sides = adapt(to, mpi_interfaces.local_sides) + + # Wrap array again + u = unsafe_wrap_or_alloc(to, _u, size(mpi_interfaces.u)) + + NDIMS = ndims(mpi_interfaces) + return P4estMPIInterfaceContainer{NDIMS, eltype(u), + NDIMS + 2, + typeof(u), typeof(local_neighbor_ids), + typeof(node_indices), typeof(_u)}(u, + local_neighbor_ids, + node_indices, + local_sides, _u) +end + # Container data structure (structure-of-arrays style) for DG L2 mortars # # Similar to `P4estMortarContainer`. The field `neighbor_ids` has been split up into @@ -88,14 +121,17 @@ end # available elements belonging to a particular MPI mortar. Furthermore, `normal_directions` holds # the normal vectors on the surface of the small elements for each mortar. 
mutable struct P4estMPIMortarContainer{NDIMS, uEltype <: Real, RealT <: Real, NDIMSP1, - NDIMSP2, NDIMSP3} <: AbstractContainer - u::Array{uEltype, NDIMSP3} # [small/large side, variable, position, i, j, mortar] - local_neighbor_ids::Vector{Vector{Int}} # [mortar][ids] - local_neighbor_positions::Vector{Vector{Int}} # [mortar][positions] - node_indices::Matrix{NTuple{NDIMS, Symbol}} # [small/large, mortar] - normal_directions::Array{RealT, NDIMSP2} # [dimension, i, j, position, mortar] + NDIMSP2, NDIMSP3, + uArray <: DenseArray{uEltype, NDIMSP3}, + uVector <: DenseVector{uEltype}} <: + AbstractContainer + u::uArray # [small/large side, variable, position, i, j, mortar] + local_neighbor_ids::Vector{Vector{Int}} # [mortar][ids] + local_neighbor_positions::Vector{Vector{Int}} # [mortar][positions] + node_indices::Matrix{NTuple{NDIMS, Symbol}} # [small/large, mortar] + normal_directions::Array{RealT, NDIMSP2} # [dimension, i, j, position, mortar] # internal `resize!`able storage - _u::Vector{uEltype} + _u::uVector _node_indices::Vector{NTuple{NDIMS, Symbol}} _normal_directions::Vector{RealT} end @@ -164,11 +200,12 @@ function init_mpi_mortars(mesh::Union{ParallelP4estMesh, ParallelT8codeMesh}, eq 2^(NDIMS - 1), n_mpi_mortars)) mpi_mortars = P4estMPIMortarContainer{NDIMS, uEltype, RealT, NDIMS + 1, NDIMS + 2, - NDIMS + 3}(u, local_neighbor_ids, - local_neighbor_positions, - node_indices, normal_directions, - _u, _node_indices, - _normal_directions) + NDIMS + 3, typeof(u), + typeof(_u)}(u, local_neighbor_ids, + local_neighbor_positions, + node_indices, normal_directions, + _u, _node_indices, + _normal_directions) if n_mpi_mortars > 0 init_mpi_mortars!(mpi_mortars, mesh, basis, elements) @@ -184,6 +221,34 @@ function init_mpi_mortars!(mpi_mortars, mesh::ParallelP4estMesh, basis, elements return mpi_mortars end +function Adapt.adapt_structure(to, mpi_mortars::P4estMPIMortarContainer) + # Only parts of this container are adapted, since we currently don't + # use `local_neighbor_ids`, `local_neighbor_positions`, `normal_directions` + # on the GPU. If we do need them we need to redesign this to use the VecOfArrays + # approach. + + _u = adapt(to, mpi_mortars._u) + _node_indices = mpi_mortars._node_indices + _normal_directions = mpi_mortars._normal_directions + + u = unsafe_wrap_or_alloc(to, _u, size(mpi_mortars.u)) + local_neighbor_ids = mpi_mortars.local_neighbor_ids + local_neighbor_positions = mpi_mortars.local_neighbor_positions + node_indices = mpi_mortars.node_indices + normal_directions = mpi_mortars.normal_directions + + NDIMS = ndims(mpi_mortars) + return P4estMPIMortarContainer{NDIMS, eltype(_u), + eltype(_normal_directions), + NDIMS + 1, NDIMS + 2, NDIMS + 3, + typeof(u), typeof(_u)}(u, local_neighbor_ids, + local_neighbor_positions, + node_indices, + normal_directions, _u, + _node_indices, + _normal_directions) +end + # Overload init! function for regular interfaces, regular mortars and boundaries since they must # call the appropriate init_surfaces! 
function for parallel p4est meshes function init_interfaces!(interfaces, mesh::ParallelP4estMesh) diff --git a/src/solvers/dgsem_p4est/dg_3d.jl b/src/solvers/dgsem_p4est/dg_3d.jl index e59f502c86c..4c099c9fd3f 100644 --- a/src/solvers/dgsem_p4est/dg_3d.jl +++ b/src/solvers/dgsem_p4est/dg_3d.jl @@ -13,18 +13,18 @@ function create_cache(mesh::Union{P4estMesh{3}, T8codeMesh{3}}, equations, fstar_primary_threaded = [Array{uEltype, 4}(undef, nvariables(equations), nnodes(mortar_l2), nnodes(mortar_l2), 4) - for _ in 1:Threads.nthreads()] + for _ in 1:Threads.nthreads()] |> VecOfArrays fstar_secondary_threaded = [Array{uEltype, 4}(undef, nvariables(equations), nnodes(mortar_l2), nnodes(mortar_l2), 4) - for _ in 1:Threads.nthreads()] + for _ in 1:Threads.nthreads()] |> VecOfArrays fstar_tmp_threaded = [Array{uEltype, 3}(undef, nvariables(equations), nnodes(mortar_l2), nnodes(mortar_l2)) - for _ in 1:Threads.nthreads()] + for _ in 1:Threads.nthreads()] |> VecOfArrays u_threaded = [Array{uEltype, 3}(undef, nvariables(equations), nnodes(mortar_l2), nnodes(mortar_l2)) - for _ in 1:Threads.nthreads()] + for _ in 1:Threads.nthreads()] |> VecOfArrays (; fstar_primary_threaded, fstar_secondary_threaded, fstar_tmp_threaded, u_threaded) end diff --git a/src/solvers/dgsem_p4est/dg_parallel.jl b/src/solvers/dgsem_p4est/dg_parallel.jl index 2cc201dd1f0..7acddf07b4b 100644 --- a/src/solvers/dgsem_p4est/dg_parallel.jl +++ b/src/solvers/dgsem_p4est/dg_parallel.jl @@ -5,12 +5,13 @@ @muladd begin #! format: noindent -mutable struct P4estMPICache{uEltype} +mutable struct P4estMPICache{BufferType <: DenseVector, + VecInt <: DenseVector{<:Integer}} mpi_neighbor_ranks::Vector{Int} - mpi_neighbor_interfaces::Vector{Vector{Int}} - mpi_neighbor_mortars::Vector{Vector{Int}} - mpi_send_buffers::Vector{Vector{uEltype}} - mpi_recv_buffers::Vector{Vector{uEltype}} + mpi_neighbor_interfaces::VecOfArrays{VecInt} + mpi_neighbor_mortars::VecOfArrays{VecInt} + mpi_send_buffers::VecOfArrays{BufferType} + mpi_recv_buffers::VecOfArrays{BufferType} mpi_send_requests::Vector{MPI.Request} mpi_recv_requests::Vector{MPI.Request} n_elements_by_rank::OffsetArray{Int, 1, Array{Int, 1}} @@ -25,25 +26,29 @@ function P4estMPICache(uEltype) end mpi_neighbor_ranks = Vector{Int}(undef, 0) - mpi_neighbor_interfaces = Vector{Vector{Int}}(undef, 0) - mpi_neighbor_mortars = Vector{Vector{Int}}(undef, 0) - mpi_send_buffers = Vector{Vector{uEltype}}(undef, 0) - mpi_recv_buffers = Vector{Vector{uEltype}}(undef, 0) + mpi_neighbor_interfaces = Vector{Vector{Int}}(undef, 0) |> VecOfArrays + mpi_neighbor_mortars = Vector{Vector{Int}}(undef, 0) |> VecOfArrays + mpi_send_buffers = Vector{Vector{uEltype}}(undef, 0) |> VecOfArrays + mpi_recv_buffers = Vector{Vector{uEltype}}(undef, 0) |> VecOfArrays mpi_send_requests = Vector{MPI.Request}(undef, 0) mpi_recv_requests = Vector{MPI.Request}(undef, 0) n_elements_by_rank = OffsetArray(Vector{Int}(undef, 0), 0:-1) n_elements_global = 0 first_element_global_id = 0 - P4estMPICache{uEltype}(mpi_neighbor_ranks, mpi_neighbor_interfaces, - mpi_neighbor_mortars, - mpi_send_buffers, mpi_recv_buffers, - mpi_send_requests, mpi_recv_requests, - n_elements_by_rank, n_elements_global, - first_element_global_id) + P4estMPICache{Vector{uEltype}, Vector{Int}}(mpi_neighbor_ranks, + mpi_neighbor_interfaces, + mpi_neighbor_mortars, + mpi_send_buffers, mpi_recv_buffers, + mpi_send_requests, mpi_recv_requests, + n_elements_by_rank, n_elements_global, + first_element_global_id) end -@inline Base.eltype(::P4estMPICache{uEltype}) where 
{uEltype} = uEltype +@inline Base.eltype(::P4estMPICache{BufferType}) where {BufferType} = eltype(BufferType) + +# @eval due to @muladd +@eval Adapt.@adapt_structure(P4estMPICache) ## # Note that the code in `start_mpi_send`/`finish_mpi_receive!` is sensitive to inference on (at least) Julia 1.10. @@ -265,16 +270,16 @@ end function init_mpi_cache!(mpi_cache::P4estMPICache, mesh::ParallelP4estMesh, mpi_interfaces, mpi_mortars, nvars, n_nodes, uEltype) - mpi_neighbor_ranks, mpi_neighbor_interfaces, mpi_neighbor_mortars = init_mpi_neighbor_connectivity(mpi_interfaces, - mpi_mortars, - mesh) + mpi_neighbor_ranks, _mpi_neighbor_interfaces, _mpi_neighbor_mortars = init_mpi_neighbor_connectivity(mpi_interfaces, + mpi_mortars, + mesh) - mpi_send_buffers, mpi_recv_buffers, mpi_send_requests, mpi_recv_requests = init_mpi_data_structures(mpi_neighbor_interfaces, - mpi_neighbor_mortars, - ndims(mesh), - nvars, - n_nodes, - uEltype) + _mpi_send_buffers, _mpi_recv_buffers, mpi_send_requests, mpi_recv_requests = init_mpi_data_structures(_mpi_neighbor_interfaces, + _mpi_neighbor_mortars, + ndims(mesh), + nvars, + n_nodes, + uEltype) # Determine local and total number of elements n_elements_global = Int(mesh.p4est.global_num_quadrants[]) @@ -286,6 +291,11 @@ function init_mpi_cache!(mpi_cache::P4estMPICache, mesh::ParallelP4estMesh, first_element_global_id = Int(mesh.p4est.global_first_quadrant[mpi_rank() + 1]) + 1 @assert n_elements_global==sum(n_elements_by_rank) "error in total number of elements" + mpi_neighbor_interfaces = VecOfArrays(_mpi_neighbor_interfaces) + mpi_neighbor_mortars = VecOfArrays(_mpi_neighbor_mortars) + mpi_send_buffers = VecOfArrays(_mpi_send_buffers) + mpi_recv_buffers = VecOfArrays(_mpi_recv_buffers) + # TODO reuse existing structures @pack! mpi_cache = mpi_neighbor_ranks, mpi_neighbor_interfaces, mpi_neighbor_mortars, diff --git a/src/solvers/dgsem_unstructured/sort_boundary_conditions.jl b/src/solvers/dgsem_unstructured/sort_boundary_conditions.jl index 0cb3bd7f409..d6cf6e1ce6d 100644 --- a/src/solvers/dgsem_unstructured/sort_boundary_conditions.jl +++ b/src/solvers/dgsem_unstructured/sort_boundary_conditions.jl @@ -13,9 +13,10 @@ It stores a set of global indices for each boundary condition type and name to e during the call to `calc_boundary_flux!`. The original dictionary form of the boundary conditions set by the user in the elixir file is also stored for printing. """ -mutable struct UnstructuredSortedBoundaryTypes{N, BCs <: NTuple{N, Any}} +mutable struct UnstructuredSortedBoundaryTypes{N, BCs <: NTuple{N, Any}, + Vec <: AbstractVector{<:Integer}} boundary_condition_types::BCs # specific boundary condition type(s), e.g. 
BoundaryConditionDirichlet
-    boundary_indices::NTuple{N, Vector{Int}} # integer vectors containing global boundary indices
+    boundary_indices::NTuple{N, Vec} # integer vectors containing global boundary indices
     boundary_dictionary::Dict{Symbol, Any} # boundary conditions as set by the user in the elixir file
     boundary_symbol_indices::Dict{Symbol, Vector{Int}} # integer vectors containing global boundary indices per boundary identifier
 end
@@ -33,10 +34,11 @@ function UnstructuredSortedBoundaryTypes(boundary_conditions::Dict, cache)
     boundary_symbol_indices = Dict{Symbol, Vector{Int}}()
 
     container = UnstructuredSortedBoundaryTypes{n_boundary_types,
-                                                typeof(boundary_condition_types)}(boundary_condition_types,
-                                                                                  boundary_indices,
-                                                                                  boundary_conditions,
-                                                                                  boundary_symbol_indices)
+                                                typeof(boundary_condition_types),
+                                                Vector{Int}}(boundary_condition_types,
+                                                             boundary_indices,
+                                                             boundary_conditions,
+                                                             boundary_symbol_indices)
 
     initialize!(container, cache)
 end
@@ -119,4 +121,7 @@ function initialize!(boundary_types_container::UnstructuredSortedBoundaryTypes{N
 
     return boundary_types_container
 end
+
+# @eval due to @muladd
+@eval Adapt.@adapt_structure(UnstructuredSortedBoundaryTypes)
 end # @muladd
diff --git a/test/Project.toml b/test/Project.toml
index 3559f8cb6e2..7e40da4ceae 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,8 +1,10 @@
 [deps]
 ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
+Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
 Convex = "f65535da-76fb-5f13-bab9-19810c17039a"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
 DoubleFloats = "497a8b3b-efae-58df-a0af-a86822472b78"
 Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
diff --git a/test/runtests.jl b/test/runtests.jl
index db2c2e9dd88..8f35e1fb58d 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -109,4 +109,13 @@ const TRIXI_NTHREADS = clamp(Sys.CPU_THREADS, 2, 3)
     @time if TRIXI_TEST == "all" || TRIXI_TEST == "paper_self_gravitating_gas_dynamics"
         include("test_paper_self_gravitating_gas_dynamics.jl")
     end
+
+    @time if TRIXI_TEST == "all" || TRIXI_TEST == "CUDA"
+        import CUDA
+        if CUDA.functional()
+            include("test_cuda.jl")
+        else
+            @warn "Unable to run CUDA tests on this machine"
+        end
+    end
 end
diff --git a/test/test_aqua.jl b/test/test_aqua.jl
index 9b3f2d67903..154088995ca 100644
--- a/test/test_aqua.jl
+++ b/test/test_aqua.jl
@@ -10,6 +10,7 @@ include("test_trixi.jl")
 @timed_testset "Aqua.jl" begin
     Aqua.test_all(Trixi,
                   ambiguities = false,
+                  unbound_args = false, # FIXME: UnstructuredSortedBoundaryTypes
                   # exceptions necessary for adding a new method `StartUpDG.estimate_h`
                   # in src/solvers/dgmulti/sbp.jl
                   piracies = (treat_as_own = [Trixi.StartUpDG.RefElemData,
diff --git a/test/test_cuda.jl b/test/test_cuda.jl
new file mode 100644
index 00000000000..1f96d8c863e
--- /dev/null
+++ b/test/test_cuda.jl
@@ -0,0 +1,52 @@
+module TestCUDA
+
+using Test
+using Trixi
+
+include("test_trixi.jl")
+
+# Start with a clean environment: remove Trixi.jl output directory if it exists
+outdir = "out"
+isdir(outdir) && rm(outdir, recursive = true)
+
+EXAMPLES_DIR = joinpath(examples_dir(), "p4est_2d_dgsem")
+
+@trixi_testset "elixir_advection_basic_gpu.jl" begin
+    # Using CUDA inside the testset since otherwise the bindings are hidden by the anonymous modules
+    using CUDA
+    # TODO(benegee/vchuravy): Port compute_coefficients! 
to KernelAbstractions.jl + CUDA.allowscalar(true) + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"), + # Expected errors are exactly the same as with TreeMesh! + l2=nothing, # [Float32(8.311947673061856e-6)], + linf=nothing, # [Float32(6.627000273229378e-5)], + RealT=Float32, + real_type=Float32, + storage_type=CuArray) + # # Ensure that we do not have excessive memory allocations + # # (e.g., from type instabilities) + # let + # t = sol.t[end] + # u_ode = sol.u[end] + # du_ode = similar(u_ode) + # @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000 + # end + @test real(ode.p.solver) == Float32 + @test real(ode.p.solver.basis) == Float32 + @test real(ode.p.solver.mortar) == Float32 + # TODO: remake ignores the mesh itself as well + @test real(ode.p.mesh) == Float64 + + @test_broken ode.u0 isa CuArray + @test ode.p.solver.basis.derivative_matrix isa CuArray + + @test Trixi.storage_type(ode.p.cache.elements) === CuArray + @test Trixi.storage_type(ode.p.cache.interfaces) === CuArray + @test Trixi.storage_type(ode.p.cache.boundaries) === CuArray + @test Trixi.storage_type(ode.p.cache.mortars) === CuArray +end + +# Clean up afterwards: delete Trixi.jl output directory +@test_nowarn isdir(outdir) && rm(outdir, recursive = true) + +end # module diff --git a/test/test_p4est_2d.jl b/test/test_p4est_2d.jl index 8f903a849d2..5d17bb1654e 100644 --- a/test/test_p4est_2d.jl +++ b/test/test_p4est_2d.jl @@ -27,6 +27,34 @@ isdir(outdir) && rm(outdir, recursive = true) du_ode = similar(u_ode) @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000 end + semi32 = Trixi.trixi_adapt(Array, Float32, semi) + @test real(semi32.solver) == Float32 + @test real(semi32.solver.basis) == Float32 + @test real(semi32.solver.mortar) == Float32 + # TODO: remake ignores the mesh itself as well + @test real(semi32.mesh) == Float64 +end + +@trixi_testset "elixir_advection_basic.jl (Float32)" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic.jl"), + # Expected errors are exactly the same as with TreeMesh! 
+ l2=[Float32(8.311947673061856e-6)], + linf=[Float32(6.627000273229378e-5)], + RealT=Float32, + real_type=Float32) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + let + t = sol.t[end] + u_ode = sol.u[end] + du_ode = similar(u_ode) + @test_broken (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000 + end + @test real(ode.p.solver) == Float32 + @test real(ode.p.solver.basis) == Float32 + @test real(ode.p.solver.mortar) == Float32 + # TODO: remake ignores the mesh itself as well + @test real(ode.p.mesh) == Float64 end @trixi_testset "elixir_advection_nonconforming_flag.jl" begin diff --git a/test/test_unstructured_2d.jl b/test/test_unstructured_2d.jl index d16bc96fb83..758e42b7da1 100644 --- a/test/test_unstructured_2d.jl +++ b/test/test_unstructured_2d.jl @@ -2,6 +2,7 @@ module TestExamplesUnstructuredMesh2D using Test using Trixi +using Adapt include("test_trixi.jl") @@ -32,6 +33,12 @@ isdir(outdir) && rm(outdir, recursive = true) du_ode = similar(u_ode) @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000 end + semi32 = Trixi.trixi_adapt(Array, Float32, semi) + @test real(semi32.solver) == Float32 + @test real(semi32.solver.basis) == Float32 + @test real(semi32.solver.mortar) == Float32 + # TODO: remake ignores the mesh as well + @test real(semi32.mesh) == Float64 end @trixi_testset "elixir_euler_free_stream.jl" begin From a18e5d2f8a440e8c794d4084ea3237a981cd9ad7 Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Tue, 1 Jul 2025 19:33:07 +0200 Subject: [PATCH 40/81] restore elixir --- examples/p4est_2d_dgsem/elixir_advection_basic.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/p4est_2d_dgsem/elixir_advection_basic.jl b/examples/p4est_2d_dgsem/elixir_advection_basic.jl index e162e8997f2..4ff646365aa 100644 --- a/examples/p4est_2d_dgsem/elixir_advection_basic.jl +++ b/examples/p4est_2d_dgsem/elixir_advection_basic.jl @@ -31,7 +31,7 @@ semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition_convergen # ODE solvers, callbacks etc. 
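# Editor's note: the revert below is cosmetic under the assumption (consistent
# with the rest of this series) that `nothing` is the default for both keywords,
# i.e. `semidiscretize(semi, (0.0, 1.0))` and
# `semidiscretize(semi, (0.0, 1.0); real_type = nothing, storage_type = nothing)`
# build the same ODE problem on plain `Array` storage.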
# Create ODE problem with time span from 0.0 to 1.0 -ode = semidiscretize(semi, (0.0, 1.0); real_type = nothing, storage_type = nothing) +ode = semidiscretize(semi, (0.0, 1.0)) # At the beginning of the main loop, the SummaryCallback prints a summary of the simulation setup # and resets the timers From 5c942fe351e0a16f3d367e67d0afe0e7f53094db Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Tue, 1 Jul 2025 15:08:30 +0200 Subject: [PATCH 41/81] offload compute_coefficients --- Project.toml | 2 + .../elixir_advection_basic_gpu.jl | 18 ++++--- src/Trixi.jl | 1 + src/auxiliary/containers.jl | 4 ++ src/semidiscretization/semidiscretization.jl | 3 +- src/solvers/dg.jl | 47 +++++++++++++++---- 6 files changed, 54 insertions(+), 21 deletions(-) diff --git a/Project.toml b/Project.toml index 875d2ae6db1..27136900dc3 100644 --- a/Project.toml +++ b/Project.toml @@ -17,6 +17,7 @@ EllipsisNotation = "da5c29d0-fa7d-589e-88eb-ea29b0a81949" FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LinearMaps = "7a12625a-238d-50fd-b39a-03d52299707e" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" @@ -82,6 +83,7 @@ EllipsisNotation = "1.0" FillArrays = "1.9" ForwardDiff = "0.10.36, 1" HDF5 = "0.16.10, 0.17" +KernelAbstractions = "0.9" LinearAlgebra = "1" LinearMaps = "2.7, 3.0" LoopVectorization = "0.12.171" diff --git a/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl index 4c0f5744a88..8a01d55f632 100644 --- a/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl +++ b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl @@ -1,8 +1,6 @@ -# The same setup as tree_2d_dgsem/elixir_advection_basic.jl -# to verify the StructuredMesh implementation against TreeMesh - -using OrdinaryDiffEqSSPRK, OrdinaryDiffEqLowStorageRK +using OrdinaryDiffEqLowStorageRK using Trixi +using CUDA ############################################################################### # semidiscretization of the linear advection equation @@ -31,7 +29,7 @@ semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition_convergen # ODE solvers, callbacks etc. # Create ODE problem with time span from 0.0 to 1.0 -ode = semidiscretize(semi, (0.0, 1.0); real_type = nothing, storage_type = nothing) +ode = semidiscretize(semi, (0.0, 1.0); real_type = nothing, storage_type = CuArray) # At the beginning of the main loop, the SummaryCallback prints a summary of the simulation setup # and resets the timers @@ -48,8 +46,8 @@ save_solution = SaveSolutionCallback(interval = 100, stepsize_callback = StepsizeCallback(cfl = 1.6) # Create a CallbackSet to collect all callbacks such that they can be passed to the ODE solver -callbacks = CallbackSet(summary_callback, analysis_callback, save_solution, - stepsize_callback) +callbacks = CallbackSet(summary_callback) +# analysis_callback, save_solution, stepsize_callback) ############################################################################### # run the simulation @@ -58,6 +56,6 @@ callbacks = CallbackSet(summary_callback, analysis_callback, save_solution, # Uncomment the calls below to discover missing functionality. 
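# Editor's note: a minimal sketch of the GPU workflow behind
# `storage_type = CuArray` above, assuming the `trixi_adapt` helper introduced
# earlier in this series (illustrative, not part of the committed elixir):
#
#     using Trixi, CUDA
#     semi_gpu = Trixi.trixi_adapt(CuArray, Float32, semi)  # device storage + Float32
#     ode_gpu = semidiscretize(semi_gpu, (0.0f0, 1.0f0))
#     @assert Trixi.storage_type(ode_gpu.p.cache.elements) === CuArray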
# # OrdinaryDiffEq's `solve` method evolves the solution in time and executes the passed callbacks -# sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false); -# dt = 1.0, # solve needs some value here but it will be overwritten by the stepsize_callback -# ode_default_options()..., callback = callbacks); + sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false); + dt = 1e-2, # solve needs some value here but it will be overwritten by the stepsize_callback + ode_default_options()..., callback = callbacks); diff --git a/src/Trixi.jl b/src/Trixi.jl index a52dfd6d973..7836f1938b1 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -59,6 +59,7 @@ using DiffEqCallbacks: PeriodicCallback, PeriodicCallbackAffect using FillArrays: Ones, Zeros using ForwardDiff: ForwardDiff using HDF5: HDF5, h5open, attributes, create_dataset, datatype, dataspace +using KernelAbstractions using LinearMaps: LinearMap if _PREFERENCE_LOOPVECTORIZATION using LoopVectorization: LoopVectorization, @turbo, indices diff --git a/src/auxiliary/containers.jl b/src/auxiliary/containers.jl index edc42db382b..40aff873956 100644 --- a/src/auxiliary/containers.jl +++ b/src/auxiliary/containers.jl @@ -405,4 +405,8 @@ end function unsafe_wrap_or_alloc(::TrixiAdaptor{Storage}, vec, size) where {Storage} return unsafe_wrap_or_alloc(Storage, vec, size) end + +function KernelAbstractions.get_backend(semi::AbstractSemidiscretization) + KernelAbstractions.get_backend(semi.cache.elements.node_coordinates) +end end # @muladd diff --git a/src/semidiscretization/semidiscretization.jl b/src/semidiscretization/semidiscretization.jl index 97c50aa46a1..e214f569d13 100644 --- a/src/semidiscretization/semidiscretization.jl +++ b/src/semidiscretization/semidiscretization.jl @@ -176,7 +176,8 @@ Same as [`compute_coefficients`](@ref) but stores the result in `u_ode`. function compute_coefficients!(u_ode, func, t, semi::AbstractSemidiscretization) u = wrap_array(u_ode, semi) # Call `compute_coefficients` defined by the solver - compute_coefficients!(u, func, t, mesh_equations_solver_cache(semi)...) + backend = get_backend(semi) + compute_coefficients!(backend, u, func, t, mesh_equations_solver_cache(semi)...) end """ diff --git a/src/solvers/dg.jl b/src/solvers/dg.jl index 78f3901a346..273cc8f7a47 100644 --- a/src/solvers/dg.jl +++ b/src/solvers/dg.jl @@ -642,8 +642,10 @@ include("fdsbp_unstructured/fdsbp.jl") function allocate_coefficients(mesh::AbstractMesh, equations, dg::DG, cache) # We must allocate a `Vector` in order to be able to `resize!` it (AMR). # cf. wrap_array - zeros(eltype(cache.elements), - nvariables(equations) * nnodes(dg)^ndims(mesh) * nelements(dg, cache)) + u_ode = similar(cache.elements.node_coordinates, + nvariables(equations) * nnodes(dg)^ndims(mesh) * nelements(dg, cache)) + fill!(u_ode, zero(eltype(u_ode))) + return u_ode end @inline function wrap_array(u_ode::AbstractVector, mesh::AbstractMesh, equations, @@ -686,7 +688,8 @@ end # (nvariables(equations), ntuple(_ -> nnodes(dg), ndims(mesh))..., nelements(dg, cache))) else # The following version is reasonably fast and allows us to `resize!(u_ode, ...)`. 
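# Editor's note: an illustrative, self-contained example of the wrapping below,
# assuming 2 variables, 3 nodes per direction, and 2 elements in 2D:
#
#     u_ode = zeros(2 * 3^2 * 2)   # flat, `resize!`-able storage
#     u = unsafe_wrap(Array{Float64, 4}, pointer(u_ode), (2, 3, 3, 2))
#
# `u` aliases `u_ode` without copying; after AMR calls `resize!(u_ode, ...)`,
# the wrapped view must be rebuilt because the underlying pointer may change.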
- unsafe_wrap(Array{eltype(u_ode), ndims(mesh) + 2}, pointer(u_ode), + ArrayType = Trixi.storage_type(u_ode) + unsafe_wrap(ArrayType{eltype(u_ode), ndims(mesh) + 2}, pointer(u_ode), (nvariables(equations), ntuple(_ -> nnodes(dg), ndims(mesh))..., nelements(dg, cache))) end @@ -756,15 +759,39 @@ function compute_coefficients!(u, func, t, mesh::AbstractMesh{1}, equations, dg: end end -function compute_coefficients!(u, func, t, mesh::AbstractMesh{2}, equations, dg::DG, +function compute_coefficients!(backend::Any, func, t, mesh::AbstractMesh{2}, equations, dg::DG, cache) + @unpack node_coordinates = cache.elements @threaded for element in eachelement(dg, cache) - for j in eachnode(dg), i in eachnode(dg) - x_node = get_node_coords(cache.elements.node_coordinates, equations, dg, i, - j, element) - u_node = func(x_node, t, equations) - set_node_vars!(u, u_node, equations, dg, i, j, element) - end + compute_coefficients_element!(u, func, t, equations, dg, node_coordinates, element) + end +end + +function compute_coefficients!(backend::Backend, u, func, t, mesh::AbstractMesh{2}, + equations, dg::DG, cache) + nelements(dg, cache) == 0 && return nothing + # 1 cache not as argument + # 2 mesh not + @unpack node_coordinates = cache.elements + kernel! = compute_coefficients_kernel!(backend) + kernel!(u, func, t, equations, dg, node_coordinates, + ndrange = nelements(dg, cache)) + return nothing +end + +@kernel function compute_coefficients_kernel!(u, func, t, equations, + dg::DG, node_coordinates) + element = @index(Global) + compute_coefficients_element!(u, func, t, equations, dg, node_coordinates, element) +end + +function compute_coefficients_element!(u, func, t, equations, dg::DG, + node_coordinates, element) + for j in eachnode(dg), i in eachnode(dg) + x_node = get_node_coords(node_coordinates, equations, dg, i, + j, element) + u_node = func(x_node, t, equations) + set_node_vars!(u, u_node, equations, dg, i, j, element) end end From 47a55f2ebea76a410e53e7a40f389587af95315f Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Tue, 1 Jul 2025 15:16:07 +0200 Subject: [PATCH 42/81] fmt --- examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl | 6 +++--- src/solvers/dg.jl | 11 +++++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl index 8a01d55f632..8fd7c31a413 100644 --- a/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl +++ b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl @@ -56,6 +56,6 @@ callbacks = CallbackSet(summary_callback) # Uncomment the calls below to discover missing functionality. # # OrdinaryDiffEq's `solve` method evolves the solution in time and executes the passed callbacks - sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false); - dt = 1e-2, # solve needs some value here but it will be overwritten by the stepsize_callback - ode_default_options()..., callback = callbacks); +sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false); + dt = 1e-2, # solve needs some value here but it will be overwritten by the stepsize_callback + ode_default_options()..., callback = callbacks); diff --git a/src/solvers/dg.jl b/src/solvers/dg.jl index 273cc8f7a47..756036a0e55 100644 --- a/src/solvers/dg.jl +++ b/src/solvers/dg.jl @@ -643,7 +643,8 @@ function allocate_coefficients(mesh::AbstractMesh, equations, dg::DG, cache) # We must allocate a `Vector` in order to be able to `resize!` it (AMR). # cf. 
wrap_array u_ode = similar(cache.elements.node_coordinates, - nvariables(equations) * nnodes(dg)^ndims(mesh) * nelements(dg, cache)) + nvariables(equations) * nnodes(dg)^ndims(mesh) * + nelements(dg, cache)) fill!(u_ode, zero(eltype(u_ode))) return u_ode end @@ -759,11 +760,13 @@ function compute_coefficients!(u, func, t, mesh::AbstractMesh{1}, equations, dg: end end -function compute_coefficients!(backend::Any, func, t, mesh::AbstractMesh{2}, equations, dg::DG, +function compute_coefficients!(backend::Any, func, t, mesh::AbstractMesh{2}, equations, + dg::DG, cache) @unpack node_coordinates = cache.elements @threaded for element in eachelement(dg, cache) - compute_coefficients_element!(u, func, t, equations, dg, node_coordinates, element) + compute_coefficients_element!(u, func, t, equations, dg, node_coordinates, + element) end end @@ -789,7 +792,7 @@ function compute_coefficients_element!(u, func, t, equations, dg::DG, node_coordinates, element) for j in eachnode(dg), i in eachnode(dg) x_node = get_node_coords(node_coordinates, equations, dg, i, - j, element) + j, element) u_node = func(x_node, t, equations) set_node_vars!(u, u_node, equations, dg, i, j, element) end From 36b0e4aae600e79a3168249e97994855e7bb81dc Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Wed, 2 Jul 2025 09:11:54 +0200 Subject: [PATCH 43/81] test native version as well --- .../elixir_advection_basic_gpu.jl | 9 +++-- src/Trixi.jl | 1 + src/auxiliary/containers.jl | 8 +++++ src/semidiscretization/semidiscretization.jl | 2 +- src/solvers/dg.jl | 7 ++-- test/test_cuda.jl | 35 ++++++++++++++++--- 6 files changed, 46 insertions(+), 16 deletions(-) diff --git a/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl index 8fd7c31a413..61277a2734f 100644 --- a/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl +++ b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl @@ -1,6 +1,5 @@ using OrdinaryDiffEqLowStorageRK using Trixi -using CUDA ############################################################################### # semidiscretization of the linear advection equation @@ -29,7 +28,7 @@ semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition_convergen # ODE solvers, callbacks etc. # Create ODE problem with time span from 0.0 to 1.0 -ode = semidiscretize(semi, (0.0, 1.0); real_type = nothing, storage_type = CuArray) +ode = semidiscretize(semi, (0.0, 1.0); real_type = nothing, storage_type = nothing) # At the beginning of the main loop, the SummaryCallback prints a summary of the simulation setup # and resets the timers @@ -56,6 +55,6 @@ callbacks = CallbackSet(summary_callback) # Uncomment the calls below to discover missing functionality. 
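# Editor's note: with this commit the elixir exercises both code paths; which one
# runs is decided at runtime by `trixi_backend` (added below in
# src/auxiliary/containers.jl):
#
#     semidiscretize(semi, tspan)                    # Array storage; the CPU
#         # backend is demoted to `nothing`, i.e. the native @threaded loops,
#         # while the "native_threading" preference is enabled
#     semidiscretize(semi, tspan; real_type = Float32,
#                    storage_type = CuArray)         # CUDA backend via @kernel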
# # OrdinaryDiffEq's `solve` method evolves the solution in time and executes the passed callbacks -sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false); - dt = 1e-2, # solve needs some value here but it will be overwritten by the stepsize_callback - ode_default_options()..., callback = callbacks); +#sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false); +# dt = 1e-2, # solve needs some value here but it will be overwritten by the stepsize_callback +# ode_default_options()..., callback = callbacks); diff --git a/src/Trixi.jl b/src/Trixi.jl index 7836f1938b1..18000e050bd 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -20,6 +20,7 @@ const _PREFERENCE_SQRT = @load_preference("sqrt", "sqrt_Trixi_NaN") const _PREFERENCE_LOG = @load_preference("log", "log_Trixi_NaN") const _PREFERENCE_POLYESTER = @load_preference("polyester", true) const _PREFERENCE_LOOPVECTORIZATION = @load_preference("loop_vectorization", true) +const _PREFERENCE_USE_NATIVE_THREADING = @load_preference("native_threading", true) # Include other packages that are used in Trixi.jl # (standard library packages first, other packages next, all of them sorted alphabetically) diff --git a/src/auxiliary/containers.jl b/src/auxiliary/containers.jl index 40aff873956..ac412eb2da8 100644 --- a/src/auxiliary/containers.jl +++ b/src/auxiliary/containers.jl @@ -406,6 +406,14 @@ function unsafe_wrap_or_alloc(::TrixiAdaptor{Storage}, vec, size) where {Storage return unsafe_wrap_or_alloc(Storage, vec, size) end +function trixi_backend(x) + backend = get_backend(x) + if _PREFERENCE_USE_NATIVE_THREADING && backend isa KernelAbstractions.CPU + backend = nothing + end + return backend +end + function KernelAbstractions.get_backend(semi::AbstractSemidiscretization) KernelAbstractions.get_backend(semi.cache.elements.node_coordinates) end diff --git a/src/semidiscretization/semidiscretization.jl b/src/semidiscretization/semidiscretization.jl index e214f569d13..b8f53237550 100644 --- a/src/semidiscretization/semidiscretization.jl +++ b/src/semidiscretization/semidiscretization.jl @@ -176,7 +176,7 @@ Same as [`compute_coefficients`](@ref) but stores the result in `u_ode`. function compute_coefficients!(u_ode, func, t, semi::AbstractSemidiscretization) u = wrap_array(u_ode, semi) # Call `compute_coefficients` defined by the solver - backend = get_backend(semi) + backend = trixi_backend(semi) compute_coefficients!(backend, u, func, t, mesh_equations_solver_cache(semi)...) end diff --git a/src/solvers/dg.jl b/src/solvers/dg.jl index 756036a0e55..9ec37647c97 100644 --- a/src/solvers/dg.jl +++ b/src/solvers/dg.jl @@ -760,9 +760,8 @@ function compute_coefficients!(u, func, t, mesh::AbstractMesh{1}, equations, dg: end end -function compute_coefficients!(backend::Any, func, t, mesh::AbstractMesh{2}, equations, - dg::DG, - cache) +function compute_coefficients!(backend::Nothing, u, func, t, mesh::AbstractMesh{2}, + equations, dg::DG, cache) @unpack node_coordinates = cache.elements @threaded for element in eachelement(dg, cache) compute_coefficients_element!(u, func, t, equations, dg, node_coordinates, @@ -773,8 +772,6 @@ end function compute_coefficients!(backend::Backend, u, func, t, mesh::AbstractMesh{2}, equations, dg::DG, cache) nelements(dg, cache) == 0 && return nothing - # 1 cache not as argument - # 2 mesh not @unpack node_coordinates = cache.elements kernel! 
= compute_coefficients_kernel!(backend) kernel!(u, func, t, equations, dg, node_coordinates, diff --git a/test/test_cuda.jl b/test/test_cuda.jl index 1f96d8c863e..c6904b41a9d 100644 --- a/test/test_cuda.jl +++ b/test/test_cuda.jl @@ -11,16 +11,41 @@ isdir(outdir) && rm(outdir, recursive = true) EXAMPLES_DIR = joinpath(examples_dir(), "p4est_2d_dgsem") -@trixi_testset "elixir_advection_basic_gpu.jl" begin +@trixi_testset "elixir_advection_basic_gpu.jl native" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"), + # Expected errors are exactly the same as with TreeMesh! + l2=nothing, # [Float32(8.311947673061856e-6)], + linf=nothing,) + # # Ensure that we do not have excessive memory allocations + # # (e.g., from type instabilities) + # let + # t = sol.t[end] + # u_ode = sol.u[end] + # du_ode = similar(u_ode) + # @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000 + # end + @test real(ode.p.solver) == Float64 + @test real(ode.p.solver.basis) == Float64 + @test real(ode.p.solver.mortar) == Float64 + # TODO: remake ignores the mesh itself as well + @test real(ode.p.mesh) == Float64 + + @test ode.u0 isa Array + @test ode.p.solver.basis.derivative_matrix isa Array + + @test Trixi.storage_type(ode.p.cache.elements) === Array + @test Trixi.storage_type(ode.p.cache.interfaces) === Array + @test Trixi.storage_type(ode.p.cache.boundaries) === Array + @test Trixi.storage_type(ode.p.cache.mortars) === Array +end + +@trixi_testset "elixir_advection_basic_gpu.jl Float32 / CUDA" begin # Using CUDA inside the testset since otherwise the bindings are hiddend by the anonymous modules using CUDA - # TODO(benegee/vchuravy): Port compute_coefficients! to KernelAbstractions.jl - CUDA.allowscalar(true) @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"), # Expected errors are exactly the same as with TreeMesh! 
l2=nothing, # [Float32(8.311947673061856e-6)], linf=nothing, # [Float32(6.627000273229378e-5)], - RealT=Float32, real_type=Float32, storage_type=CuArray) # # Ensure that we do not have excessive memory allocations @@ -37,7 +62,7 @@ EXAMPLES_DIR = joinpath(examples_dir(), "p4est_2d_dgsem") # TODO: remake ignores the mesh itself as well @test real(ode.p.mesh) == Float64 - @test_broken ode.u0 isa CuArray + @test ode.u0 isa CuArray @test ode.p.solver.basis.derivative_matrix isa CuArray @test Trixi.storage_type(ode.p.cache.elements) === CuArray From 153d8289418e33574425eafcdc443aeae52b5441 Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Wed, 2 Jul 2025 09:34:33 +0200 Subject: [PATCH 44/81] adapt 1D and 3D version --- src/solvers/dg.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/solvers/dg.jl b/src/solvers/dg.jl index 9ec37647c97..a9ed65d7070 100644 --- a/src/solvers/dg.jl +++ b/src/solvers/dg.jl @@ -739,8 +739,8 @@ end nelements(dg, cache))) end -function compute_coefficients!(u, func, t, mesh::AbstractMesh{1}, equations, dg::DG, - cache) +function compute_coefficients!(backend::Nothing, u, func, t, mesh::AbstractMesh{1}, + equations, dg::DG, cache) @threaded for element in eachelement(dg, cache) for i in eachnode(dg) x_node = get_node_coords(cache.elements.node_coordinates, equations, dg, i, @@ -795,8 +795,8 @@ function compute_coefficients_element!(u, func, t, equations, dg::DG, end end -function compute_coefficients!(u, func, t, mesh::AbstractMesh{3}, equations, dg::DG, - cache) +function compute_coefficients!(backend::Nothing, u, func, t, mesh::AbstractMesh{3}, + equations, dg::DG, cache) @threaded for element in eachelement(dg, cache) for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) x_node = get_node_coords(cache.elements.node_coordinates, equations, dg, i, From 819ba7525c534568c3a127a6e371e2995e6e92bf Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Wed, 2 Jul 2025 09:34:49 +0200 Subject: [PATCH 45/81] Downgrade compat with Adapt --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 27136900dc3..51614052357 100644 --- a/Project.toml +++ b/Project.toml @@ -83,7 +83,7 @@ EllipsisNotation = "1.0" FillArrays = "1.9" ForwardDiff = "0.10.36, 1" HDF5 = "0.16.10, 0.17" -KernelAbstractions = "0.9" +KernelAbstractions = "0.9.15" LinearAlgebra = "1" LinearMaps = "2.7, 3.0" LoopVectorization = "0.12.171" From e75cac7dbaf1eec9d45776a90125c541e57762f5 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Wed, 2 Jul 2025 10:41:15 +0200 Subject: [PATCH 46/81] update requires to 1.3 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 51614052357..fa88a560ed2 100644 --- a/Project.toml +++ b/Project.toml @@ -101,7 +101,7 @@ Printf = "1" RecipesBase = "1.3.4" RecursiveArrayTools = "3.31.1" Reexport = "1.2" -Requires = "1.1" +Requires = "1.3" SciMLBase = "2.67.0" SimpleUnPack = "1.1" SparseArrays = "1" From e7cde27d80f50658d9061372ecd17e1980de9440 Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Tue, 16 Sep 2025 11:04:49 +0200 Subject: [PATCH 47/81] missed during merge --- src/solvers/dgsem_p4est/containers.jl | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/solvers/dgsem_p4est/containers.jl b/src/solvers/dgsem_p4est/containers.jl index c8db5388e77..3f74f699f19 100644 --- a/src/solvers/dgsem_p4est/containers.jl +++ b/src/solvers/dgsem_p4est/containers.jl @@ -223,13 +223,8 @@ mutable struct P4estInterfaceContainer{NDIMS, 
uEltype <: Real, NDIMSP2, IndicesVector <: DenseVector{NTuple{NDIMS, Symbol}}} <: AbstractContainer -<<<<<<< HEAD - u::uArray # [primary/secondary, variable, i, j, interface] - neighbor_ids::IdsMatrix # [primary/secondary, interface] -======= u::uArray # [primary/secondary, variable, i, j, interface] neighbor_ids::IdsMatrix # [primary/secondary, interface] ->>>>>>> main node_indices::IndicesMatrix # [primary/secondary, interface] # internal `resize!`able storage From b174d6d9e5c0d66afd05bea3885952e069e2d5e4 Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Tue, 16 Sep 2025 13:19:28 +0200 Subject: [PATCH 48/81] mistakes during merge --- src/Trixi.jl | 1 - src/semidiscretization/semidiscretization.jl | 1 - 2 files changed, 2 deletions(-) diff --git a/src/Trixi.jl b/src/Trixi.jl index d98920bcf0b..9412c33db6f 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -20,7 +20,6 @@ const _PREFERENCE_SQRT = @load_preference("sqrt", "sqrt_Trixi_NaN") const _PREFERENCE_LOG = @load_preference("log", "log_Trixi_NaN") const _PREFERENCE_THREADING = Symbol(@load_preference("backend", "polyester")) const _PREFERENCE_LOOPVECTORIZATION = @load_preference("loop_vectorization", true) -const _PREFERENCE_USE_NATIVE_THREADING = @load_preference("native_threading", true) # Include other packages that are used in Trixi.jl # (standard library packages first, other packages next, all of them sorted alphabetically) diff --git a/src/semidiscretization/semidiscretization.jl b/src/semidiscretization/semidiscretization.jl index ef2847ced6a..a629ff64f0d 100644 --- a/src/semidiscretization/semidiscretization.jl +++ b/src/semidiscretization/semidiscretization.jl @@ -230,7 +230,6 @@ function compute_coefficients!(u_ode, func, t, semi::AbstractSemidiscretization) backend = trixi_backend(u_ode) u = wrap_array(u_ode, semi) # Call `compute_coefficients` defined by the solver - backend = trixi_backend(semi) compute_coefficients!(backend, u, func, t, mesh_equations_solver_cache(semi)...) 
end From 489bb24933d57c68799b15ea8bf6efcbf09f597e Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Thu, 18 Sep 2025 12:02:12 +0200 Subject: [PATCH 49/81] cleanup --- Project.toml | 2 -- src/auxiliary/containers.jl | 4 ---- 2 files changed, 6 deletions(-) diff --git a/Project.toml b/Project.toml index 8eb7aa80e5b..e898cdf144b 100644 --- a/Project.toml +++ b/Project.toml @@ -59,7 +59,6 @@ Convex = "f65535da-76fb-5f13-bab9-19810c17039a" ECOS = "e2685f51-7e38-5353-a97d-a921fd2c8199" Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a" NLsolve = "2774e3e8-f4cf-5e23-947b-6d7e65073b56" -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" SparseConnectivityTracer = "9f842d2f-2579-4b1d-911e-f412cf18a3f5" [extensions] @@ -67,7 +66,6 @@ TrixiCUDAExt = "CUDA" TrixiConvexECOSExt = ["Convex", "ECOS"] TrixiMakieExt = "Makie" TrixiNLsolveExt = "NLsolve" -TrixiCUDAExt = "CUDA" TrixiSparseConnectivityTracerExt = "SparseConnectivityTracer" [compat] diff --git a/src/auxiliary/containers.jl b/src/auxiliary/containers.jl index 874b238f1cf..5036863ff4b 100644 --- a/src/auxiliary/containers.jl +++ b/src/auxiliary/containers.jl @@ -380,10 +380,6 @@ function trixi_backend(x::VectorOfArray) return get_backend(u[1]) end -function KernelAbstractions.get_backend(semi::AbstractSemidiscretization) - KernelAbstractions.get_backend(semi.cache.elements.node_coordinates) -end - # For some storage backends like CUDA.jl, empty arrays do seem to simply be # null pointers which can cause `unsafe_wrap` to fail when calling # Adapt.adapt (ArgumentError, see From b4d15354e80eb796bf4f17f2769444afc9faabdc Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Thu, 18 Sep 2025 12:05:02 +0200 Subject: [PATCH 50/81] Basis kernels for 3D P4est - prolong2interfaces - calc_interface_flux - calc_surface_integral - calc_volume_integral (weak_form_kernel) - apply_jacobian --- .../semidiscretization_hyperbolic.jl | 3 +- src/solvers/dg.jl | 53 +- src/solvers/dgsem_p4est/dg_3d.jl | 455 +++++++++++------- src/solvers/dgsem_p4est/dg_3d_parallel.jl | 2 +- src/solvers/dgsem_structured/dg_1d.jl | 2 +- src/solvers/dgsem_structured/dg_2d.jl | 2 +- src/solvers/dgsem_structured/dg_3d.jl | 57 ++- src/solvers/dgsem_tree/dg_1d.jl | 2 +- src/solvers/dgsem_tree/dg_2d.jl | 2 +- src/solvers/dgsem_tree/dg_2d_parallel.jl | 2 +- src/solvers/dgsem_tree/dg_3d.jl | 45 +- src/solvers/dgsem_unstructured/dg_2d.jl | 2 +- 12 files changed, 396 insertions(+), 231 deletions(-) diff --git a/src/semidiscretization/semidiscretization_hyperbolic.jl b/src/semidiscretization/semidiscretization_hyperbolic.jl index 2a563c02229..b49c18cbd37 100644 --- a/src/semidiscretization/semidiscretization_hyperbolic.jl +++ b/src/semidiscretization/semidiscretization_hyperbolic.jl @@ -399,10 +399,11 @@ function rhs!(du_ode, u_ode, semi::SemidiscretizationHyperbolic, t) u = wrap_array(u_ode, mesh, equations, solver, cache) du = wrap_array(du_ode, mesh, equations, solver, cache) + backend = trixi_backend(u_ode) # TODO: Taal decide, do we need to pass the mesh? time_start = time_ns() - @trixi_timeit timer() "rhs!" rhs!(du, u, t, mesh, equations, + @trixi_timeit timer() "rhs!" 
rhs!(backend, du, u, t, mesh, equations,
                                      boundary_conditions, source_terms,
                                      solver, cache)
     runtime = time_ns() - time_start
     put!(semi.performance_counter, runtime)
diff --git a/src/solvers/dg.jl b/src/solvers/dg.jl
index 509c12dab95..f402aad2ebd 100644
--- a/src/solvers/dg.jl
+++ b/src/solvers/dg.jl
@@ -610,6 +610,13 @@ end
     return u_ll, u_rr
 end
 
+# As above but dispatches on a type argument
+@inline function get_surface_node_vars(u, equations, ::Type{<:DG}, indices...)
+    u_ll = SVector(ntuple(@inline(v->u[1, v, indices...]), Val(nvariables(equations))))
+    u_rr = SVector(ntuple(@inline(v->u[2, v, indices...]), Val(nvariables(equations))))
+    return u_ll, u_rr
+end
+
 @inline function set_node_vars!(u, u_node, equations, solver::DG, indices...)
     for v in eachvariable(equations)
         u[v, indices...] = u_node[v]
@@ -774,54 +781,46 @@ function compute_coefficients!(backend::Nothing, u, func, t, mesh::AbstractMesh{
     return nothing
 end
 
-function compute_coefficients!(backend::Nothing, u, func, t, mesh::AbstractMesh{2},
+function compute_coefficients!(backend::Nothing, u, func, t,
+                               mesh::Union{AbstractMesh{2}, AbstractMesh{3}},
                                equations, dg::DG, cache)
     @unpack node_coordinates = cache.elements
+    node_indices = CartesianIndices(ntuple(_ -> nnodes(dg), ndims(mesh)))
     @threaded for element in eachelement(dg, cache)
         compute_coefficients_element!(u, func, t, equations, dg, node_coordinates,
-                                      element)
+                                      element, node_indices)
     end
     return nothing
 end
 
-function compute_coefficients!(backend::Backend, u, func, t, mesh::AbstractMesh{2},
+function compute_coefficients!(backend::Backend, u, func, t,
+                               mesh::Union{AbstractMesh{2}, AbstractMesh{3}},
                                equations, dg::DG, cache)
     nelements(dg, cache) == 0 && return nothing
+
     @unpack node_coordinates = cache.elements
-    kernel! = compute_coefficients_kernel!(backend)
-    kernel!(u, func, t, equations, dg, node_coordinates,
+    node_indices = CartesianIndices(ntuple(_ -> nnodes(dg), ndims(mesh)))
+
+    kernel!
= compute_coefficients_KAkernel!(backend) + kernel!(u, func, t, equations, dg, node_coordinates, node_indices, ndrange = nelements(dg, cache)) return nothing end -@kernel function compute_coefficients_kernel!(u, func, t, equations, - dg::DG, node_coordinates) +@kernel function compute_coefficients_KAkernel!(u, func, t, equations, + dg::DG, node_coordinates, node_indices) element = @index(Global) - compute_coefficients_element!(u, func, t, equations, dg, node_coordinates, element) + compute_coefficients_element!(u, func, t, equations, dg, node_coordinates, element, + node_indices) end function compute_coefficients_element!(u, func, t, equations, dg::DG, - node_coordinates, element) - for j in eachnode(dg), i in eachnode(dg) - x_node = get_node_coords(node_coordinates, equations, dg, i, - j, element) + node_coordinates, element, node_indices) + for indices in node_indices + x_node = get_node_coords(node_coordinates, equations, dg, indices, element) u_node = func(x_node, t, equations) - set_node_vars!(u, u_node, equations, dg, i, j, element) - end - - return nothing -end - -function compute_coefficients!(backend::Nothing, u, func, t, mesh::AbstractMesh{3}, - equations, dg::DG, cache) - @threaded for element in eachelement(dg, cache) - for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) - x_node = get_node_coords(cache.elements.node_coordinates, equations, dg, i, - j, k, element) - u_node = func(x_node, t, equations) - set_node_vars!(u, u_node, equations, dg, i, j, k, element) - end + set_node_vars!(u, u_node, equations, dg, indices, element) end return nothing diff --git a/src/solvers/dgsem_p4est/dg_3d.jl b/src/solvers/dgsem_p4est/dg_3d.jl index 63cf78ddd94..510f4d3c717 100644 --- a/src/solvers/dgsem_p4est/dg_3d.jl +++ b/src/solvers/dgsem_p4est/dg_3d.jl @@ -91,85 +91,116 @@ end return (i1, i2) end -function prolong2interfaces!(cache, u, +function prolong2interfaces!(backend::Nothing, cache, u, mesh::Union{P4estMesh{3}, T8codeMesh{3}}, equations, dg::DG) @unpack interfaces = cache + @unpack neighbor_ids, node_indices = cache.interfaces index_range = eachnode(dg) @threaded for interface in eachinterface(dg, cache) - # Copy solution data from the primary element using "delayed indexing" with - # a start value and two step sizes to get the correct face and orientation. - # Note that in the current implementation, the interface will be - # "aligned at the primary element", i.e., the indices of the primary side - # will always run forwards. 
- primary_element = interfaces.neighbor_ids[1, interface] - primary_indices = interfaces.node_indices[1, interface] - - i_primary_start, i_primary_step_i, i_primary_step_j = index_to_start_step_3d(primary_indices[1], - index_range) - j_primary_start, j_primary_step_i, j_primary_step_j = index_to_start_step_3d(primary_indices[2], - index_range) - k_primary_start, k_primary_step_i, k_primary_step_j = index_to_start_step_3d(primary_indices[3], - index_range) - - i_primary = i_primary_start - j_primary = j_primary_start - k_primary = k_primary_start - for j in eachnode(dg) - for i in eachnode(dg) - for v in eachvariable(equations) - interfaces.u[1, v, i, j, interface] = u[v, - i_primary, j_primary, - k_primary, - primary_element] - end - i_primary += i_primary_step_i - j_primary += j_primary_step_i - k_primary += k_primary_step_i + prolong2interfaces_interface!(interfaces.u, u, typeof(mesh), equations, + neighbor_ids, node_indices, index_range, + interface) + end + return nothing +end + +function prolong2interfaces!(backend::Backend, cache, u, + mesh::Union{P4estMesh{3}, T8codeMesh{3}}, + equations, dg::DG) + @unpack interfaces = cache + @unpack neighbor_ids, node_indices = cache.interfaces + index_range = eachnode(dg) + + kernel! = prolong2interfaces_KAkernel!(backend) + kernel!(interfaces.u, u, typeof(mesh), equations, neighbor_ids, node_indices, + index_range, + ndrange = ninterfaces(interfaces)) + return nothing +end + +@kernel function prolong2interfaces_KAkernel!(interface_u, u, meshT, equations, + neighbor_ids, node_indices, index_range) + interface = @index(Global) + prolong2interfaces_interface!(interface_u, u, meshT, equations, neighbor_ids, + node_indices, index_range, interface) +end + +function prolong2interfaces_interface!(u_interface, u, + ::Type{<:Union{P4estMesh{3}, T8codeMesh{3}}}, + equations, neighbor_ids, node_indices, + index_range, interface) + # Copy solution data from the primary element using "delayed indexing" with + # a start value and two step sizes to get the correct face and orientation. + # Note that in the current implementation, the interface will be + # "aligned at the primary element", i.e., the indices of the primary side + # will always run forwards. + primary_element = neighbor_ids[1, interface] + primary_indices = node_indices[1, interface] + + i_primary_start, i_primary_step_i, i_primary_step_j = index_to_start_step_3d(primary_indices[1], + index_range) + j_primary_start, j_primary_step_i, j_primary_step_j = index_to_start_step_3d(primary_indices[2], + index_range) + k_primary_start, k_primary_step_i, k_primary_step_j = index_to_start_step_3d(primary_indices[3], + index_range) + + i_primary = i_primary_start + j_primary = j_primary_start + k_primary = k_primary_start + for j in index_range + for i in index_range + for v in eachvariable(equations) + u_interface[1, v, i, j, interface] = u[v, + i_primary, j_primary, + k_primary, + primary_element] end - i_primary += i_primary_step_j - j_primary += j_primary_step_j - k_primary += k_primary_step_j + i_primary += i_primary_step_i + j_primary += j_primary_step_i + k_primary += k_primary_step_i end + i_primary += i_primary_step_j + j_primary += j_primary_step_j + k_primary += k_primary_step_j + end - # Copy solution data from the secondary element using "delayed indexing" with - # a start value and two step sizes to get the correct face and orientation. 
- secondary_element = interfaces.neighbor_ids[2, interface] - secondary_indices = interfaces.node_indices[2, interface] - - i_secondary_start, i_secondary_step_i, i_secondary_step_j = index_to_start_step_3d(secondary_indices[1], - index_range) - j_secondary_start, j_secondary_step_i, j_secondary_step_j = index_to_start_step_3d(secondary_indices[2], - index_range) - k_secondary_start, k_secondary_step_i, k_secondary_step_j = index_to_start_step_3d(secondary_indices[3], - index_range) - - i_secondary = i_secondary_start - j_secondary = j_secondary_start - k_secondary = k_secondary_start - for j in eachnode(dg) - for i in eachnode(dg) - for v in eachvariable(equations) - interfaces.u[2, v, i, j, interface] = u[v, - i_secondary, j_secondary, - k_secondary, - secondary_element] - end - i_secondary += i_secondary_step_i - j_secondary += j_secondary_step_i - k_secondary += k_secondary_step_i + # Copy solution data from the secondary element using "delayed indexing" with + # a start value and two step sizes to get the correct face and orientation. + secondary_element = neighbor_ids[2, interface] + secondary_indices = node_indices[2, interface] + + i_secondary_start, i_secondary_step_i, i_secondary_step_j = index_to_start_step_3d(secondary_indices[1], + index_range) + j_secondary_start, j_secondary_step_i, j_secondary_step_j = index_to_start_step_3d(secondary_indices[2], + index_range) + k_secondary_start, k_secondary_step_i, k_secondary_step_j = index_to_start_step_3d(secondary_indices[3], + index_range) + + i_secondary = i_secondary_start + j_secondary = j_secondary_start + k_secondary = k_secondary_start + for j in index_range + for i in index_range + for v in eachvariable(equations) + u_interface[2, v, i, j, interface] = u[v, + i_secondary, j_secondary, + k_secondary, + secondary_element] end - i_secondary += i_secondary_step_j - j_secondary += j_secondary_step_j - k_secondary += k_secondary_step_j + i_secondary += i_secondary_step_i + j_secondary += j_secondary_step_i + k_secondary += k_secondary_step_i end + i_secondary += i_secondary_step_j + j_secondary += j_secondary_step_j + k_secondary += k_secondary_step_j end - return nothing end -function calc_interface_flux!(surface_flux_values, +function calc_interface_flux!(backend::Nothing, surface_flux_values, mesh::Union{P4estMesh{3}, T8codeMesh{3}}, nonconservative_terms, equations, surface_integral, dg::DG, cache) @@ -178,92 +209,139 @@ function calc_interface_flux!(surface_flux_values, index_range = eachnode(dg) @threaded for interface in eachinterface(dg, cache) - # Get element and side information on the primary element - primary_element = neighbor_ids[1, interface] - primary_indices = node_indices[1, interface] - primary_direction = indices2direction(primary_indices) - - i_primary_start, i_primary_step_i, i_primary_step_j = index_to_start_step_3d(primary_indices[1], - index_range) - j_primary_start, j_primary_step_i, j_primary_step_j = index_to_start_step_3d(primary_indices[2], - index_range) - k_primary_start, k_primary_step_i, k_primary_step_j = index_to_start_step_3d(primary_indices[3], - index_range) - - i_primary = i_primary_start - j_primary = j_primary_start - k_primary = k_primary_start - - # Get element and side information on the secondary element - secondary_element = neighbor_ids[2, interface] - secondary_indices = node_indices[2, interface] - secondary_direction = indices2direction(secondary_indices) - secondary_surface_indices = surface_indices(secondary_indices) - - # Get the surface indexing on the secondary element. 
-        # Note that the indices of the primary side will always run forward but
-        # the secondary indices might need to run backwards for flipped sides.
-        i_secondary_start, i_secondary_step_i, i_secondary_step_j = index_to_start_step_3d(secondary_surface_indices[1],
-                                                                                           index_range)
-        j_secondary_start, j_secondary_step_i, j_secondary_step_j = index_to_start_step_3d(secondary_surface_indices[2],
-                                                                                           index_range)
-        i_secondary = i_secondary_start
-        j_secondary = j_secondary_start
+        calc_interface_flux_interface!(surface_flux_values,
+                                       typeof(mesh),
+                                       nonconservative_terms,
+                                       equations, surface_integral, typeof(dg),
+                                       cache.interfaces.u, neighbor_ids, node_indices,
+                                       contravariant_vectors, index_range, interface)
+    end
+    return nothing
+end
+
+function calc_interface_flux!(backend::Backend, surface_flux_values,
+                              mesh::Union{P4estMesh{3}, T8codeMesh{3}},
+                              nonconservative_terms,
+                              equations, surface_integral, dg::DG, cache)
+    @unpack neighbor_ids, node_indices = cache.interfaces
+    @unpack contravariant_vectors = cache.elements
+    index_range = eachnode(dg)
+
+    kernel! = calc_interface_flux_KAkernel!(backend)
+    kernel!(surface_flux_values, typeof(mesh), nonconservative_terms, equations,
+            surface_integral, typeof(dg), cache.interfaces.u,
+            neighbor_ids, node_indices, contravariant_vectors, index_range,
+            ndrange = ninterfaces(cache.interfaces))
+    return nothing
+end
+
+@kernel function calc_interface_flux_KAkernel!(surface_flux_values, meshT,
+                                               nonconservative_terms, equations,
+                                               surface_integral, solverT, u_interface,
+                                               neighbor_ids, node_indices,
+                                               contravariant_vectors, index_range)
+    interface = @index(Global)
+    calc_interface_flux_interface!(surface_flux_values,
+                                   meshT,
+                                   nonconservative_terms,
+                                   equations, surface_integral, solverT, u_interface,
+                                   neighbor_ids, node_indices, contravariant_vectors,
+                                   index_range, interface)
+end
+
+function calc_interface_flux_interface!(surface_flux_values,
+                                        meshT::Type{<:Union{P4estMesh{3},
+                                                            T8codeMesh{3}}},
+                                        nonconservative_terms,
+                                        equations, surface_integral,
+                                        solverT::Type{<:DG}, u_interface, neighbor_ids,
+                                        node_indices, contravariant_vectors,
+                                        index_range, interface)
+    # Get element and side information on the primary element
+    primary_element = neighbor_ids[1, interface]
+    primary_indices = node_indices[1, interface]
+    primary_direction = indices2direction(primary_indices)
+
+    i_primary_start, i_primary_step_i, i_primary_step_j = index_to_start_step_3d(primary_indices[1],
+                                                                                 index_range)
+    j_primary_start, j_primary_step_i, j_primary_step_j = index_to_start_step_3d(primary_indices[2],
+                                                                                 index_range)
+    k_primary_start, k_primary_step_i, k_primary_step_j = index_to_start_step_3d(primary_indices[3],
+                                                                                 index_range)
+
+    i_primary = i_primary_start
+    j_primary = j_primary_start
+    k_primary = k_primary_start
+
+    # Get element and side information on the secondary element
+    secondary_element = neighbor_ids[2, interface]
+    secondary_indices = node_indices[2, interface]
+    secondary_direction = indices2direction(secondary_indices)
+    secondary_surface_indices = surface_indices(secondary_indices)
+
+    # Get the surface indexing on the secondary element.
+    # Note that the indices of the primary side will always run forward but
+    # the secondary indices might need to run backwards for flipped sides.
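    # Editor's note (illustration; assumes the helper's convention): with
    # `index_range = 1:4`, a backward symbol such as `:i_backward` makes
    # `index_to_start_step_3d` return start 4 and step -1 along the face, so
    # the secondary side below is traversed in reverse while i/j keep running
    # forward on the primary side.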
+ i_secondary_start, i_secondary_step_i, i_secondary_step_j = index_to_start_step_3d(secondary_surface_indices[1], + index_range) + j_secondary_start, j_secondary_step_i, j_secondary_step_j = index_to_start_step_3d(secondary_surface_indices[2], + index_range) + i_secondary = i_secondary_start + j_secondary = j_secondary_start + + for j in index_range + for i in index_range + # Get the normal direction from the primary element. + # Note, contravariant vectors at interfaces in negative coordinate direction + # are pointing inwards. This is handled by `get_normal_direction`. + normal_direction = get_normal_direction(primary_direction, + contravariant_vectors, + i_primary, j_primary, k_primary, + primary_element) + + calc_interface_flux!(surface_flux_values, meshT, nonconservative_terms, + equations, + surface_integral, solverT, u_interface, + interface, normal_direction, + i, j, primary_direction, primary_element, + i_secondary, j_secondary, secondary_direction, + secondary_element) - for j in eachnode(dg) - for i in eachnode(dg) - # Get the normal direction from the primary element. - # Note, contravariant vectors at interfaces in negative coordinate direction - # are pointing inwards. This is handled by `get_normal_direction`. - normal_direction = get_normal_direction(primary_direction, - contravariant_vectors, - i_primary, j_primary, k_primary, - primary_element) - - calc_interface_flux!(surface_flux_values, mesh, nonconservative_terms, - equations, - surface_integral, dg, cache, - interface, normal_direction, - i, j, primary_direction, primary_element, - i_secondary, j_secondary, secondary_direction, - secondary_element) - - # Increment the primary element indices - i_primary += i_primary_step_i - j_primary += j_primary_step_i - k_primary += k_primary_step_i - # Increment the secondary element surface indices - i_secondary += i_secondary_step_i - j_secondary += j_secondary_step_i - end # Increment the primary element indices - i_primary += i_primary_step_j - j_primary += j_primary_step_j - k_primary += k_primary_step_j + i_primary += i_primary_step_i + j_primary += j_primary_step_i + k_primary += k_primary_step_i # Increment the secondary element surface indices - i_secondary += i_secondary_step_j - j_secondary += j_secondary_step_j + i_secondary += i_secondary_step_i + j_secondary += j_secondary_step_i end + # Increment the primary element indices + i_primary += i_primary_step_j + j_primary += j_primary_step_j + k_primary += k_primary_step_j + # Increment the secondary element surface indices + i_secondary += i_secondary_step_j + j_secondary += j_secondary_step_j end - return nothing end # Inlined function for interface flux computation for conservative flux terms @inline function calc_interface_flux!(surface_flux_values, - mesh::Union{P4estMesh{3}, T8codeMesh{3}}, + ::Type{<:Union{P4estMesh{3}, T8codeMesh{3}}}, nonconservative_terms::False, equations, - surface_integral, dg::DG, cache, + surface_integral, solverT::Type{<:DG}, + u_interface, interface_index, normal_direction, primary_i_node_index, primary_j_node_index, primary_direction_index, primary_element_index, secondary_i_node_index, secondary_j_node_index, secondary_direction_index, secondary_element_index) - @unpack u = cache.interfaces @unpack surface_flux = surface_integral - u_ll, u_rr = get_surface_node_vars(u, equations, dg, primary_i_node_index, + u_ll, u_rr = get_surface_node_vars(u_interface, equations, solverT, + primary_i_node_index, primary_j_node_index, interface_index) flux_ = surface_flux(u_ll, u_rr, 
normal_direction, equations) @@ -813,7 +891,7 @@ end return nothing end -function calc_surface_integral!(du, u, +function calc_surface_integral!(backend::Nothing, du, u, mesh::Union{P4estMesh{3}, T8codeMesh{3}}, equations, surface_integral::SurfaceIntegralWeakForm, @@ -821,51 +899,86 @@ function calc_surface_integral!(du, u, @unpack boundary_interpolation = dg.basis @unpack surface_flux_values = cache.elements + @threaded for element in eachelement(dg, cache) + calc_surface_integral_element!(du, typeof(mesh), + equations, + surface_integral, dg, surface_flux_values, + element) + end + return nothing +end + +function calc_surface_integral!(backend::Backend, du, u, + mesh::Union{P4estMesh{3}, T8codeMesh{3}}, + equations, + surface_integral::SurfaceIntegralWeakForm, + dg::DGSEM, cache) + @unpack boundary_interpolation = dg.basis + @unpack surface_flux_values = cache.elements + + kernel! = calc_surface_integral_KAkernel!(backend) + kernel!(du, typeof(mesh), equations, surface_integral, dg, surface_flux_values, + ndrange = nelements(cache.elements)) + return nothing +end + +@kernel function calc_surface_integral_KAkernel!(du, meshT, equations, + surface_integral, dg, + surface_flux_values) + element = @index(Global) + calc_surface_integral_element!(du, meshT, + equations, + surface_integral, dg, surface_flux_values, element) +end + +function calc_surface_integral_element!(du, + ::Type{<:Union{P4estMesh{3}, T8codeMesh{3}}}, + equations, + surface_integral::SurfaceIntegralWeakForm, + dg::DGSEM, surface_flux_values, element) # Note that all fluxes have been computed with outward-pointing normal vectors. # Access the factors only once before beginning the loop to increase performance. # We also use explicit assignments instead of `+=` to let `@muladd` turn these # into FMAs (see comment at the top of the file). 
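# Editor's note: a small example of the point above; under the file-level
# `@muladd begin ... end`, MuladdMacro rewrites
#
#     du_new = du_old + flux * factor      # -> muladd(flux, factor, du_old)
#
# whereas a `+=` update would not be rewritten, which is why the explicit
# assignment form is kept in the loop below.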
- factor_1 = boundary_interpolation[1, 1] - factor_2 = boundary_interpolation[nnodes(dg), 2] - @threaded for element in eachelement(dg, cache) - for m in eachnode(dg), l in eachnode(dg) - for v in eachvariable(equations) - # surface at -x - du[v, 1, l, m, element] = (du[v, 1, l, m, element] + - surface_flux_values[v, l, m, 1, element] * - factor_1) - - # surface at +x - du[v, nnodes(dg), l, m, element] = (du[v, nnodes(dg), l, m, element] + - surface_flux_values[v, l, m, 2, - element] * - factor_2) - - # surface at -y - du[v, l, 1, m, element] = (du[v, l, 1, m, element] + - surface_flux_values[v, l, m, 3, element] * - factor_1) - - # surface at +y - du[v, l, nnodes(dg), m, element] = (du[v, l, nnodes(dg), m, element] + - surface_flux_values[v, l, m, 4, - element] * - factor_2) - - # surface at -z - du[v, l, m, 1, element] = (du[v, l, m, 1, element] + - surface_flux_values[v, l, m, 5, element] * - factor_1) - - # surface at +z - du[v, l, m, nnodes(dg), element] = (du[v, l, m, nnodes(dg), element] + - surface_flux_values[v, l, m, 6, - element] * - factor_2) - end + # TODO GPU: dg is adapted, accessing scalars outside of kernel is therefor not useful + factor_1 = dg.basis.boundary_interpolation[1, 1] + factor_2 = dg.basis.boundary_interpolation[nnodes(dg), 2] + for m in eachnode(dg), l in eachnode(dg) + for v in eachvariable(equations) + # surface at -x + du[v, 1, l, m, element] = (du[v, 1, l, m, element] + + surface_flux_values[v, l, m, 1, element] * + factor_1) + + # surface at +x + du[v, nnodes(dg), l, m, element] = (du[v, nnodes(dg), l, m, element] + + surface_flux_values[v, l, m, 2, + element] * + factor_2) + + # surface at -y + du[v, l, 1, m, element] = (du[v, l, 1, m, element] + + surface_flux_values[v, l, m, 3, element] * + factor_1) + + # surface at +y + du[v, l, nnodes(dg), m, element] = (du[v, l, nnodes(dg), m, element] + + surface_flux_values[v, l, m, 4, + element] * + factor_2) + + # surface at -z + du[v, l, m, 1, element] = (du[v, l, m, 1, element] + + surface_flux_values[v, l, m, 5, element] * + factor_1) + + # surface at +z + du[v, l, m, nnodes(dg), element] = (du[v, l, m, nnodes(dg), element] + + surface_flux_values[v, l, m, 6, + element] * + factor_2) end end - return nothing end end # @muladd diff --git a/src/solvers/dgsem_p4est/dg_3d_parallel.jl b/src/solvers/dgsem_p4est/dg_3d_parallel.jl index 520bc1c0599..276ddd9fb56 100644 --- a/src/solvers/dgsem_p4est/dg_3d_parallel.jl +++ b/src/solvers/dgsem_p4est/dg_3d_parallel.jl @@ -5,7 +5,7 @@ @muladd begin #! format: noindent -function rhs!(du, u, t, +function rhs!(backend, du, u, t, mesh::Union{ParallelP4estMesh{3}, ParallelT8codeMesh{3}}, equations, boundary_conditions, source_terms::Source, dg::DG, cache) where {Source} diff --git a/src/solvers/dgsem_structured/dg_1d.jl b/src/solvers/dgsem_structured/dg_1d.jl index ee2832e66a8..d85e4bab7a9 100644 --- a/src/solvers/dgsem_structured/dg_1d.jl +++ b/src/solvers/dgsem_structured/dg_1d.jl @@ -5,7 +5,7 @@ @muladd begin #! format: noindent -function rhs!(du, u, t, +function rhs!(backend, du, u, t, mesh::StructuredMesh{1}, equations, boundary_conditions, source_terms::Source, dg::DG, cache) where {Source} diff --git a/src/solvers/dgsem_structured/dg_2d.jl b/src/solvers/dgsem_structured/dg_2d.jl index a02a44bf4dd..2979bf1b254 100644 --- a/src/solvers/dgsem_structured/dg_2d.jl +++ b/src/solvers/dgsem_structured/dg_2d.jl @@ -5,7 +5,7 @@ @muladd begin #! 
format: noindent -function rhs!(du, u, t, +function rhs!(backend, du, u, t, mesh::Union{StructuredMesh{2}, StructuredMeshView{2}}, equations, boundary_conditions, source_terms::Source, dg::DG, cache) where {Source} diff --git a/src/solvers/dgsem_structured/dg_3d.jl b/src/solvers/dgsem_structured/dg_3d.jl index aba79f3a5a5..0ad3fca68b8 100644 --- a/src/solvers/dgsem_structured/dg_3d.jl +++ b/src/solvers/dgsem_structured/dg_3d.jl @@ -5,7 +5,7 @@ @muladd begin #! format: noindent -function rhs!(du, u, t, +function rhs!(backend, du, u, t, mesh::StructuredMesh{3}, equations, boundary_conditions, source_terms::Source, dg::DG, cache) where {Source} @@ -56,16 +56,17 @@ see `flux_differencing_kernel!`. This treatment is required to achieve, e.g., entropy-stability or well-balancedness. See also https://github.com/trixi-framework/Trixi.jl/issues/1671#issuecomment-1765644064 =# -@inline function weak_form_kernel!(du, u, - element, - mesh::Union{StructuredMesh{3}, P4estMesh{3}, - T8codeMesh{3}}, - nonconservative_terms::False, equations, - dg::DGSEM, cache, alpha = true) +@inline function weak_form_kernel_element!(du, u, + element, + ::Type{<:Union{StructuredMesh{3}, + P4estMesh{3}, + T8codeMesh{3}}}, + nonconservative_terms::False, equations, + dg::DGSEM, contravariant_vectors, + alpha = true) # true * [some floating point value] == [exactly the same floating point value] # This can (hopefully) be optimized away due to constant propagation. @unpack derivative_dhat = dg.basis - @unpack contravariant_vectors = cache.elements for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) u_node = get_node_vars(u, equations, dg, i, j, k, element) @@ -800,19 +801,45 @@ function calc_boundary_flux!(cache, u, t, boundary_conditions::NamedTuple, return nothing end -function apply_jacobian!(du, +function apply_jacobian!(backend::Nothing, du, mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, equations, dg::DG, cache) + @unpack inverse_jacobian = cache.elements @threaded for element in eachelement(dg, cache) - for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) - factor = -cache.elements.inverse_jacobian[i, j, k, element] + apply_jacobian_element!(du, typeof(mesh), equations, dg, inverse_jacobian, + element) + end + return nothing +end - for v in eachvariable(equations) - du[v, i, j, k, element] *= factor - end +function apply_jacobian!(backend::Backend, du, + mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, + equations, dg::DG, cache) + @unpack inverse_jacobian = cache.elements + + kernel! 
= apply_jacobian_KAkernel!(backend) + kernel!(du, typeof(mesh), equations, dg, inverse_jacobian, + ndrange = nelements(cache.elements)) + return nothing +end + +@kernel function apply_jacobian_KAkernel!(du, meshT, equations, dg::DG, + inverse_jacobian) + element = @index(Global) + apply_jacobian_element!(du, meshT, equations, dg, inverse_jacobian, element) +end + +function apply_jacobian_element!(du, + ::Type{<:Union{StructuredMesh{3}, P4estMesh{3}, + T8codeMesh{3}}}, + equations, dg, inverse_jacobian, element) + for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + factor = -inverse_jacobian[i, j, k, element] + + for v in eachvariable(equations) + du[v, i, j, k, element] *= factor end end - return nothing end end # @muladd diff --git a/src/solvers/dgsem_tree/dg_1d.jl b/src/solvers/dgsem_tree/dg_1d.jl index 659a3babdcc..b0528a341ef 100644 --- a/src/solvers/dgsem_tree/dg_1d.jl +++ b/src/solvers/dgsem_tree/dg_1d.jl @@ -67,7 +67,7 @@ end # TODO: Taal discuss/refactor timer, allowing users to pass a custom timer? -function rhs!(du, u, t, +function rhs!(backend, du, u, t, mesh::TreeMesh{1}, equations, boundary_conditions, source_terms::Source, dg::DG, cache) where {Source} diff --git a/src/solvers/dgsem_tree/dg_2d.jl b/src/solvers/dgsem_tree/dg_2d.jl index 8b30219d29b..e7ca6b19dcb 100644 --- a/src/solvers/dgsem_tree/dg_2d.jl +++ b/src/solvers/dgsem_tree/dg_2d.jl @@ -112,7 +112,7 @@ end # TODO: Taal discuss/refactor timer, allowing users to pass a custom timer? -function rhs!(du, u, t, +function rhs!(backend, du, u, t, mesh::Union{TreeMesh{2}, P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}}, equations, boundary_conditions, source_terms::Source, diff --git a/src/solvers/dgsem_tree/dg_2d_parallel.jl b/src/solvers/dgsem_tree/dg_2d_parallel.jl index cb522aa3eaa..ef8b57c93d8 100644 --- a/src/solvers/dgsem_tree/dg_2d_parallel.jl +++ b/src/solvers/dgsem_tree/dg_2d_parallel.jl @@ -447,7 +447,7 @@ function init_mpi_neighbor_connectivity(elements, mpi_interfaces, mpi_mortars, return mpi_neighbor_ranks, mpi_neighbor_interfaces, mpi_neighbor_mortars end -function rhs!(du, u, t, +function rhs!(backend, du, u, t, mesh::Union{ParallelTreeMesh{2}, ParallelP4estMesh{2}, ParallelT8codeMesh{2}}, equations, boundary_conditions, source_terms::Source, diff --git a/src/solvers/dgsem_tree/dg_3d.jl b/src/solvers/dgsem_tree/dg_3d.jl index 7c8f5e0749c..f6147eb5056 100644 --- a/src/solvers/dgsem_tree/dg_3d.jl +++ b/src/solvers/dgsem_tree/dg_3d.jl @@ -159,7 +159,7 @@ end # TODO: Taal discuss/refactor timer, allowing users to pass a custom timer? 
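# Editor's note: the mechanical change across all these `rhs!` methods is the
# new leading `backend` argument; a self-contained sketch of the dispatch
# pattern (illustrative names, not Trixi API):
#
#     using KernelAbstractions
#     scale!(::Nothing, du, c) =                        # native threading path
#         Threads.@threads for i in eachindex(du)
#             du[i] *= c
#         end
#     @kernel function scale_kernel!(du, c)
#         i = @index(Global)
#         du[i] *= c
#     end
#     scale!(backend::KernelAbstractions.Backend, du, c) =
#         scale_kernel!(backend)(du, c; ndrange = length(du))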
-function rhs!(du, u, t, +function rhs!(backend, du, u, t, mesh::Union{TreeMesh{3}, P4estMesh{3}, T8codeMesh{3}}, equations, boundary_conditions, source_terms::Source, dg::DG, cache) where {Source} @@ -168,19 +168,19 @@ function rhs!(du, u, t, # Calculate volume integral @trixi_timeit timer() "volume integral" begin - calc_volume_integral!(du, u, mesh, + calc_volume_integral!(backend, du, u, mesh, have_nonconservative_terms(equations), equations, dg.volume_integral, dg, cache) end # Prolong solution to interfaces @trixi_timeit timer() "prolong2interfaces" begin - prolong2interfaces!(cache, u, mesh, equations, dg) + prolong2interfaces!(backend, cache, u, mesh, equations, dg) end # Calculate interface fluxes @trixi_timeit timer() "interface flux" begin - calc_interface_flux!(cache.elements.surface_flux_values, mesh, + calc_interface_flux!(backend, cache.elements.surface_flux_values, mesh, have_nonconservative_terms(equations), equations, dg.surface_integral, dg, cache) end @@ -212,12 +212,13 @@ function rhs!(du, u, t, # Calculate surface integrals @trixi_timeit timer() "surface integral" begin - calc_surface_integral!(du, u, mesh, equations, + calc_surface_integral!(backend, du, u, mesh, equations, dg.surface_integral, dg, cache) end # Apply Jacobian from mapping to reference element - @trixi_timeit timer() "Jacobian" apply_jacobian!(du, mesh, equations, dg, cache) + @trixi_timeit timer() "Jacobian" apply_jacobian!(backend, du, mesh, equations, dg, + cache) # Calculate source terms @trixi_timeit timer() "source terms" begin @@ -227,21 +228,45 @@ function rhs!(du, u, t, return nothing end -function calc_volume_integral!(du, u, +function calc_volume_integral!(backend::Nothing, du, u, mesh::Union{TreeMesh{3}, StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, nonconservative_terms, equations, volume_integral::VolumeIntegralWeakForm, dg::DGSEM, cache) + @unpack contravariant_vectors = cache.elements @threaded for element in eachelement(dg, cache) - weak_form_kernel!(du, u, element, mesh, - nonconservative_terms, equations, - dg, cache) + weak_form_kernel_element!(du, u, element, typeof(mesh), + nonconservative_terms, equations, + dg, contravariant_vectors) end + return nothing +end + +function calc_volume_integral!(backend::Backend, du, u, + mesh::Union{TreeMesh{3}, StructuredMesh{3}, P4estMesh{3}, + T8codeMesh{3}}, + nonconservative_terms, equations, + volume_integral::VolumeIntegralWeakForm, + dg::DGSEM, cache) + nelements(dg, cache) == 0 && return nothing + @unpack contravariant_vectors = cache.elements + kernel! 
= weak_form_KAkernel!(backend) + kernel!(du, u, typeof(mesh), nonconservative_terms, equations, dg, + contravariant_vectors, + ndrange = nelements(dg, cache)) return nothing end +@kernel function weak_form_KAkernel!(du, u, meshT, nonconservative_terms, equations, + dg::DGSEM, contravariant_vectors) + element = @index(Global) + weak_form_kernel_element!(du, u, element, meshT, + nonconservative_terms, equations, + dg, contravariant_vectors) +end + #= `weak_form_kernel!` is only implemented for conserved terms as non-conservative terms should always be discretized in conjunction with a flux-splitting scheme, diff --git a/src/solvers/dgsem_unstructured/dg_2d.jl b/src/solvers/dgsem_unstructured/dg_2d.jl index 4f90ba11a46..27554ffd320 100644 --- a/src/solvers/dgsem_unstructured/dg_2d.jl +++ b/src/solvers/dgsem_unstructured/dg_2d.jl @@ -34,7 +34,7 @@ function create_cache(mesh::UnstructuredMesh2D, equations, return cache end -function rhs!(du, u, t, +function rhs!(backend, du, u, t, mesh::UnstructuredMesh2D, equations, boundary_conditions, source_terms::Source, dg::DG, cache) where {Source} From 2443cf85193ff8ef418fce7a969ba5f1c9c26bf1 Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Thu, 18 Sep 2025 12:06:14 +0200 Subject: [PATCH 51/81] port stepsize computation --- src/callbacks_step/stepsize.jl | 6 +- src/callbacks_step/stepsize_dg1d.jl | 8 +-- src/callbacks_step/stepsize_dg2d.jl | 20 +++--- src/callbacks_step/stepsize_dg3d.jl | 108 +++++++++++++++++++--------- src/solvers/dgmulti/dg.jl | 4 +- 5 files changed, 96 insertions(+), 50 deletions(-) diff --git a/src/callbacks_step/stepsize.jl b/src/callbacks_step/stepsize.jl index eac6f54261c..d643e91bd8d 100644 --- a/src/callbacks_step/stepsize.jl +++ b/src/callbacks_step/stepsize.jl @@ -118,8 +118,9 @@ end function calculate_dt(u_ode, t, cfl_number::Real, semi::AbstractSemidiscretization) mesh, equations, solver, cache = mesh_equations_solver_cache(semi) u = wrap_array(u_ode, mesh, equations, solver, cache) + backend = trixi_backend(u_ode) - dt = cfl_number * max_dt(u, t, mesh, + dt = cfl_number * max_dt(backend, u, t, mesh, have_constant_speed(equations), equations, solver, cache) end @@ -127,8 +128,9 @@ end function calculate_dt(u_ode, t, cfl_number, semi::AbstractSemidiscretization) mesh, equations, solver, cache = mesh_equations_solver_cache(semi) u = wrap_array(u_ode, mesh, equations, solver, cache) + backend = trixi_backend(u_ode) - dt = cfl_number(t) * max_dt(u, t, mesh, + dt = cfl_number(t) * max_dt(backend, u, t, mesh, have_constant_speed(equations), equations, solver, cache) end diff --git a/src/callbacks_step/stepsize_dg1d.jl b/src/callbacks_step/stepsize_dg1d.jl index 7be0f074135..cfaa3adff2d 100644 --- a/src/callbacks_step/stepsize_dg1d.jl +++ b/src/callbacks_step/stepsize_dg1d.jl @@ -5,7 +5,7 @@ @muladd begin #! format: noindent -function max_dt(u, t, mesh::TreeMesh{1}, +function max_dt(backend, u, t, mesh::TreeMesh{1}, constant_speed::False, equations, dg::DG, cache) # to avoid a division by zero if the speed vanishes everywhere, # e.g. for steady-state linear advection @@ -25,7 +25,7 @@ function max_dt(u, t, mesh::TreeMesh{1}, return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(u, t, mesh::TreeMesh{1}, +function max_dt(backend, u, t, mesh::TreeMesh{1}, constant_speed::True, equations, dg::DG, cache) # to avoid a division by zero if the speed vanishes everywhere, # e.g. 
for steady-state linear advection @@ -41,7 +41,7 @@ function max_dt(u, t, mesh::TreeMesh{1}, return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(u, t, mesh::StructuredMesh{1}, +function max_dt(backend, u, t, mesh::StructuredMesh{1}, constant_speed::False, equations, dg::DG, cache) # to avoid a division by zero if the speed vanishes everywhere, # e.g. for steady-state linear advection @@ -65,7 +65,7 @@ function max_dt(u, t, mesh::StructuredMesh{1}, return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(u, t, mesh::StructuredMesh{1}, +function max_dt(backend, u, t, mesh::StructuredMesh{1}, constant_speed::True, equations, dg::DG, cache) # to avoid a division by zero if the speed vanishes everywhere, # e.g. for steady-state linear advection diff --git a/src/callbacks_step/stepsize_dg2d.jl b/src/callbacks_step/stepsize_dg2d.jl index a7c0dd2a0af..0d3e798b28f 100644 --- a/src/callbacks_step/stepsize_dg2d.jl +++ b/src/callbacks_step/stepsize_dg2d.jl @@ -5,7 +5,7 @@ @muladd begin #! format: noindent -function max_dt(u, t, mesh::TreeMesh{2}, +function max_dt(backend, u, t, mesh::TreeMesh{2}, constant_speed::False, equations, dg::DG, cache) # to avoid a division by zero if the speed vanishes everywhere, # e.g. for steady-state linear advection @@ -27,7 +27,7 @@ function max_dt(u, t, mesh::TreeMesh{2}, return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(u, t, mesh::TreeMesh{2}, +function max_dt(backend, u, t, mesh::TreeMesh{2}, constant_speed::True, equations, dg::DG, cache) # to avoid a division by zero if the speed vanishes everywhere, # e.g. for steady-state linear advection @@ -44,7 +44,7 @@ function max_dt(u, t, mesh::TreeMesh{2}, return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(u, t, mesh::ParallelTreeMesh{2}, +function max_dt(backend, u, t, mesh::ParallelTreeMesh{2}, constant_speed::False, equations, dg::DG, cache) # call the method accepting a general `mesh::TreeMesh{2}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` @@ -61,7 +61,7 @@ function max_dt(u, t, mesh::ParallelTreeMesh{2}, return dt end -function max_dt(u, t, mesh::ParallelTreeMesh{2}, +function max_dt(backend, u, t, mesh::ParallelTreeMesh{2}, constant_speed::True, equations, dg::DG, cache) # call the method accepting a general `mesh::TreeMesh{2}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` @@ -78,7 +78,7 @@ function max_dt(u, t, mesh::ParallelTreeMesh{2}, return dt end -function max_dt(u, t, +function max_dt(backend, u, t, mesh::Union{StructuredMesh{2}, UnstructuredMesh2D, P4estMesh{2}, T8codeMesh{2}, StructuredMeshView{2}}, constant_speed::False, equations, dg::DG, cache) @@ -114,7 +114,7 @@ function max_dt(u, t, return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(u, t, +function max_dt(backend, u, t, mesh::Union{StructuredMesh{2}, UnstructuredMesh2D, P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}, StructuredMeshView{2}}, constant_speed::True, equations, dg::DG, cache) @@ -146,7 +146,7 @@ function max_dt(u, t, return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(u, t, mesh::ParallelP4estMesh{2}, +function max_dt(backend, u, t, mesh::ParallelP4estMesh{2}, constant_speed::False, equations, dg::DG, cache) # call the method accepting a general `mesh::P4estMesh{2}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` @@ -163,7 +163,7 @@ function max_dt(u, t, mesh::ParallelP4estMesh{2}, return dt end -function max_dt(u, t, mesh::ParallelP4estMesh{2}, +function max_dt(backend, u, t, 
mesh::ParallelP4estMesh{2}, constant_speed::True, equations, dg::DG, cache) # call the method accepting a general `mesh::P4estMesh{2}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` @@ -180,7 +180,7 @@ function max_dt(u, t, mesh::ParallelP4estMesh{2}, return dt end -function max_dt(u, t, mesh::ParallelT8codeMesh{2}, +function max_dt(backend, u, t, mesh::ParallelT8codeMesh{2}, constant_speed::False, equations, dg::DG, cache) # call the method accepting a general `mesh::T8codeMesh{2}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` @@ -197,7 +197,7 @@ function max_dt(u, t, mesh::ParallelT8codeMesh{2}, return dt end -function max_dt(u, t, mesh::ParallelT8codeMesh{2}, +function max_dt(backend, u, t, mesh::ParallelT8codeMesh{2}, constant_speed::True, equations, dg::DG, cache) # call the method accepting a general `mesh::T8codeMesh{2}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` diff --git a/src/callbacks_step/stepsize_dg3d.jl b/src/callbacks_step/stepsize_dg3d.jl index 897f7d8b22b..159dca720d6 100644 --- a/src/callbacks_step/stepsize_dg3d.jl +++ b/src/callbacks_step/stepsize_dg3d.jl @@ -5,7 +5,7 @@ @muladd begin #! format: noindent -function max_dt(u, t, mesh::TreeMesh{3}, +function max_dt(backend, u, t, mesh::TreeMesh{3}, constant_speed::False, equations, dg::DG, cache) # to avoid a division by zero if the speed vanishes everywhere, # e.g. for steady-state linear advection @@ -28,7 +28,7 @@ function max_dt(u, t, mesh::TreeMesh{3}, return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(u, t, mesh::TreeMesh{3}, +function max_dt(backend, u, t, mesh::TreeMesh{3}, constant_speed::True, equations, dg::DG, cache) # to avoid a division by zero if the speed vanishes everywhere, # e.g. for steady-state linear advection @@ -45,51 +45,95 @@ function max_dt(u, t, mesh::TreeMesh{3}, return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(u, t, mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, +function max_dt(backend::Nothing, u, t, + mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, constant_speed::False, equations, dg::DG, cache) + # to avoid a division by zero if the speed vanishes everywhere, # e.g. 
for steady-state linear advection max_scaled_speed = nextfloat(zero(t)) - @unpack contravariant_vectors = cache.elements + @unpack contravariant_vectors, inverse_jacobian = cache.elements @batch reduction=(max, max_scaled_speed) for element in eachelement(dg, cache) - max_lambda1 = max_lambda2 = max_lambda3 = zero(max_scaled_speed) - for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) - u_node = get_node_vars(u, equations, dg, i, j, k, element) - lambda1, lambda2, lambda3 = max_abs_speeds(u_node, equations) + max_lambda = max_scaled_speed_element(u, typeof(mesh), equations, dg, + contravariant_vectors, inverse_jacobian, + element) + max_scaled_speed = max(max_scaled_speed, max_lambda) + end - Ja11, Ja12, Ja13 = get_contravariant_vector(1, contravariant_vectors, - i, j, k, element) - lambda1_transformed = abs(Ja11 * lambda1 + Ja12 * lambda2 + Ja13 * lambda3) - Ja21, Ja22, Ja23 = get_contravariant_vector(2, contravariant_vectors, - i, j, k, element) - lambda2_transformed = abs(Ja21 * lambda1 + Ja22 * lambda2 + Ja23 * lambda3) - Ja31, Ja32, Ja33 = get_contravariant_vector(3, contravariant_vectors, - i, j, k, element) - lambda3_transformed = abs(Ja31 * lambda1 + Ja32 * lambda2 + Ja33 * lambda3) + return 2 / (nnodes(dg) * max_scaled_speed) +end - inv_jacobian = abs(cache.elements.inverse_jacobian[i, j, k, element]) +function max_dt(backend::Backend, u, t, + mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, + constant_speed::False, equations, dg::DG, cache) + @unpack contravariant_vectors, inverse_jacobian = cache.elements + num_elements = nelements(dg, cache) + max_scaled_speeds = allocate(backend, eltype(t), num_elements) - max_lambda1 = max(max_lambda1, inv_jacobian * lambda1_transformed) - max_lambda2 = max(max_lambda2, inv_jacobian * lambda2_transformed) - max_lambda3 = max(max_lambda3, inv_jacobian * lambda3_transformed) - end + kernel! = max_scaled_speed_KAkernel!(backend) + kernel!(max_scaled_speeds, u, typeof(mesh), equations, dg, contravariant_vectors, + inverse_jacobian; + ndrange = num_elements) - max_scaled_speed = max(max_scaled_speed, - max_lambda1 + max_lambda2 + max_lambda3) - end + # TODO GPU dt on CPU? 
(time integration happens on CPU) + max_scaled_speed = max(nextfloat(zero(t)), maximum(max_scaled_speeds)) return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(u, t, mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, +@kernel function max_scaled_speed_KAkernel!(max_scaled_speeds, u, meshT, equations, + dg, contravariant_vectors, inverse_jacobian) + element = @index(Global) + max_scaled_speeds[element] = max_scaled_speed_element(du, meshT, + equations, + surface_integral, dg, + surface_flux_values, element) +end + +function max_scaled_speed_element(u, + ::Type{<:Union{StructuredMesh{3}, P4estMesh{3}, + T8codeMesh{3}}}, equations, dg, + contravariant_vectors, inverse_jacobian, element) + max_lambda1 = max_lambda2 = max_lambda3 = zero(max_scaled_speed) + for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + u_node = get_node_vars(u, equations, dg, i, j, k, element) + lambda1, lambda2, lambda3 = max_abs_speeds(u_node, equations) + + Ja11, Ja12, Ja13 = get_contravariant_vector(1, contravariant_vectors, + i, j, k, element) + lambda1_transformed = abs(Ja11 * lambda1 + Ja12 * lambda2 + Ja13 * lambda3) + Ja21, Ja22, Ja23 = get_contravariant_vector(2, contravariant_vectors, + i, j, k, element) + lambda2_transformed = abs(Ja21 * lambda1 + Ja22 * lambda2 + Ja23 * lambda3) + Ja31, Ja32, Ja33 = get_contravariant_vector(3, contravariant_vectors, + i, j, k, element) + lambda3_transformed = abs(Ja31 * lambda1 + Ja32 * lambda2 + Ja33 * lambda3) + + inv_jacobian = abs(inverse_jacobian[i, j, k, element]) + + max_lambda1 = max(max_lambda1, inv_jacobian * lambda1_transformed) + max_lambda2 = max(max_lambda2, inv_jacobian * lambda2_transformed) + max_lambda3 = max(max_lambda3, inv_jacobian * lambda3_transformed) + end + return max_lambda1 + max_lambda2 + max_lambda3 +end + +function max_dt(backend, u, t, + mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, constant_speed::True, equations, dg::DG, cache) # to avoid a division by zero if the speed vanishes everywhere, # e.g. for steady-state linear advection max_scaled_speed = nextfloat(zero(t)) - @unpack contravariant_vectors = cache.elements + if backend isa Nothing # TODO GPU KA CPU backend as well + @unpack contravariant_vectors, inverse_jacobian = cache.elements + else + # TODO GPU is this sufficient? 
+ contravariant_vectors = Array(cache.elements.contravariant_vectors) + inverse_jacobian = Array(cache.elements.inverse_jacobian) + end max_lambda1, max_lambda2, max_lambda3 = max_abs_speeds(equations) @@ -108,7 +152,7 @@ function max_dt(u, t, mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3} lambda3_transformed = abs(Ja31 * max_lambda1 + Ja32 * max_lambda2 + Ja33 * max_lambda3) - inv_jacobian = abs(cache.elements.inverse_jacobian[i, j, k, element]) + inv_jacobian = abs(inverse_jacobian[i, j, k, element]) max_scaled_speed = max(max_scaled_speed, inv_jacobian * @@ -120,7 +164,7 @@ function max_dt(u, t, mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3} return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(u, t, mesh::ParallelP4estMesh{3}, +function max_dt(backend, u, t, mesh::ParallelP4estMesh{3}, constant_speed::False, equations, dg::DG, cache) # call the method accepting a general `mesh::P4estMesh{3}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` @@ -137,7 +181,7 @@ function max_dt(u, t, mesh::ParallelP4estMesh{3}, return dt end -function max_dt(u, t, mesh::ParallelP4estMesh{3}, +function max_dt(backend, u, t, mesh::ParallelP4estMesh{3}, constant_speed::True, equations, dg::DG, cache) # call the method accepting a general `mesh::P4estMesh{3}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` @@ -154,7 +198,7 @@ function max_dt(u, t, mesh::ParallelP4estMesh{3}, return dt end -function max_dt(u, t, mesh::ParallelT8codeMesh{3}, +function max_dt(backend, u, t, mesh::ParallelT8codeMesh{3}, constant_speed::False, equations, dg::DG, cache) # call the method accepting a general `mesh::T8codeMesh{3}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` @@ -171,7 +215,7 @@ function max_dt(u, t, mesh::ParallelT8codeMesh{3}, return dt end -function max_dt(u, t, mesh::ParallelT8codeMesh{3}, +function max_dt(backend, u, t, mesh::ParallelT8codeMesh{3}, constant_speed::True, equations, dg::DG, cache) # call the method accepting a general `mesh::T8codeMesh{3}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` diff --git a/src/solvers/dgmulti/dg.jl b/src/solvers/dgmulti/dg.jl index e3e01d42171..2be73e5e208 100644 --- a/src/solvers/dgmulti/dg.jl +++ b/src/solvers/dgmulti/dg.jl @@ -240,7 +240,7 @@ function dt_polydeg_scaling(dg::DGMulti{3, <:Wedge, <:TensorProductWedge}) end # for the stepsize callback -function max_dt(u, t, mesh::DGMultiMesh, +function max_dt(backend, u, t, mesh::DGMultiMesh, constant_speed::False, equations, dg::DGMulti{NDIMS}, cache) where {NDIMS} @unpack md = mesh @@ -263,7 +263,7 @@ function max_dt(u, t, mesh::DGMultiMesh, return 2 * dt_min * dt_polydeg_scaling(dg) end -function max_dt(u, t, mesh::DGMultiMesh, +function max_dt(backend, u, t, mesh::DGMultiMesh, constant_speed::True, equations, dg::DGMulti{NDIMS}, cache) where {NDIMS} @unpack md = mesh From fc13ea55f2c2fbde5a361e3d24109bfd49bf5470 Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Thu, 18 Sep 2025 12:08:08 +0200 Subject: [PATCH 52/81] CPU workaround for analysis callback --- src/callbacks_step/analysis_dg2d.jl | 41 ++++++++++++++++++++++++----- src/callbacks_step/analysis_dg3d.jl | 40 +++++++++++++++++++++++----- src/callbacks_step/save_solution.jl | 9 ++++++- 3 files changed, 77 insertions(+), 13 deletions(-) diff --git a/src/callbacks_step/analysis_dg2d.jl b/src/callbacks_step/analysis_dg2d.jl index fa18c5af63a..0c4b1bc0b22 100644 --- a/src/callbacks_step/analysis_dg2d.jl +++ b/src/callbacks_step/analysis_dg2d.jl @@ 
-138,7 +138,7 @@ function calc_error_norms(func, u, t, analyzer, return l2_error, linf_error end -function calc_error_norms(func, u, t, analyzer, +function calc_error_norms(func, _u, t, analyzer, mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, UnstructuredMesh2D, P4estMesh{2}, P4estMeshView{2}, @@ -146,9 +146,19 @@ function calc_error_norms(func, u, t, analyzer, equations, initial_condition, dg::DGSEM, cache, cache_analysis) @unpack vandermonde, weights = analyzer - @unpack node_coordinates, inverse_jacobian = cache.elements @unpack u_local, u_tmp1, x_local, x_tmp1, jacobian_local, jacobian_tmp1 = cache_analysis + # TODO GPU AnalysisCallback currently lives on CPU + backend = trixi_backend(_u) + if backend isa Nothing # TODO GPU KA CPU backend + @unpack node_coordinates, inverse_jacobian = cache.elements + u = _u + else + node_coordinates = Array(cache.elements.node_coordinates) + inverse_jacobian = Array(cache.elements.inverse_jacobian) + u = Array(_u) + end + # Set up data structures l2_error = zero(func(get_node_vars(u, equations, dg, 1, 1, 1), equations)) linf_error = copy(l2_error) @@ -210,13 +220,23 @@ function integrate_via_indices(func::Func, u, return integral end -function integrate_via_indices(func::Func, u, +function integrate_via_indices(func::Func, _u, mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, UnstructuredMesh2D, P4estMesh{2}, T8codeMesh{2}}, equations, dg::DGSEM, cache, args...; normalize = true) where {Func} - @unpack weights = dg.basis + # TODO GPU AnalysisCallback currently lives on CPU + backend = trixi_backend(_u) + if backend isa Nothing # TODO GPU KA CPU backend + @unpack weights = dg.basis + @unpack inverse_jacobian = cache.elements + u = _u + else + weights = Array(dg.basis.weights) + inverse_jacobian = Array(cache.elements.inverse_jacobian) + u = Array(_u) + end # Initialize integral with zeros of the right shape integral = zero(func(u, 1, 1, 1, equations, dg, args...)) @@ -226,7 +246,7 @@ function integrate_via_indices(func::Func, u, @batch reduction=((+, integral), (+, total_volume)) for element in eachelement(dg, cache) for j in eachnode(dg), i in eachnode(dg) - volume_jacobian = abs(inv(cache.elements.inverse_jacobian[i, j, element])) + volume_jacobian = abs(inv(inverse_jacobian[i, j, element])) integral += volume_jacobian * weights[i] * weights[j] * func(u, i, j, element, equations, dg, args...)
total_volume += volume_jacobian * weights[i] * weights[j] @@ -271,10 +291,19 @@ function integrate(func::Func, u, end end -function analyze(::typeof(entropy_timederivative), du, u, t, +function analyze(::typeof(entropy_timederivative), _du, u, t, mesh::Union{TreeMesh{2}, StructuredMesh{2}, StructuredMeshView{2}, UnstructuredMesh2D, P4estMesh{2}, T8codeMesh{2}}, equations, dg::DG, cache) + # TODO GPU AnalysisCallback currently lives on CPU + backend = trixi_backend(u) + if backend isa Nothing # TODO GPU KA CPU backend + du = _du + else + du = Array(_du) + end + # Calculate ∫(∂S/∂u ⋅ ∂u/∂t)dΩ integrate_via_indices(u, mesh, equations, dg, cache, du) do u, i, j, element, equations, dg, du diff --git a/src/callbacks_step/analysis_dg3d.jl b/src/callbacks_step/analysis_dg3d.jl index 072ffc16096..d9bd08a868d 100644 --- a/src/callbacks_step/analysis_dg3d.jl +++ b/src/callbacks_step/analysis_dg3d.jl @@ -161,14 +161,24 @@ function calc_error_norms(func, u, t, analyzer, return l2_error, linf_error end -function calc_error_norms(func, u, t, analyzer, +function calc_error_norms(func, _u, t, analyzer, mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, equations, initial_condition, dg::DGSEM, cache, cache_analysis) @unpack vandermonde, weights = analyzer - @unpack node_coordinates, inverse_jacobian = cache.elements @unpack u_local, u_tmp1, u_tmp2, x_local, x_tmp1, x_tmp2, jacobian_local, jacobian_tmp1, jacobian_tmp2 = cache_analysis + # TODO GPU AnalysisCallback currently lives on CPU + backend = trixi_backend(_u) + if backend isa Nothing # TODO GPU KA CPU backend + @unpack node_coordinates, inverse_jacobian = cache.elements + u = _u + else + node_coordinates = Array(cache.elements.node_coordinates) + inverse_jacobian = Array(cache.elements.inverse_jacobian) + u = Array(_u) + end + # Set up data structures l2_error = zero(func(get_node_vars(u, equations, dg, 1, 1, 1, 1), equations)) linf_error = copy(l2_error) @@ -234,12 +244,22 @@ function integrate_via_indices(func::Func, u, return integral end -function integrate_via_indices(func::Func, u, +function integrate_via_indices(func::Func, _u, mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, equations, dg::DGSEM, cache, args...; normalize = true) where {Func} - @unpack weights = dg.basis + # TODO GPU AnalysisCallback currently lives on CPU + backend = trixi_backend(_u) + if backend isa Nothing # TODO GPU KA CPU backend + @unpack weights = dg.basis + @unpack inverse_jacobian = cache.elements + u = _u + else + weights = Array(dg.basis.weights) + inverse_jacobian = Array(cache.elements.inverse_jacobian) + u = Array(_u) + end # Initialize integral with zeros of the right shape integral = zero(func(u, 1, 1, 1, 1, equations, dg, args...)) @@ -249,7 +269,7 @@ function integrate_via_indices(func::Func, u, @batch reduction=((+, integral), (+, total_volume)) for element in eachelement(dg, cache) for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) - volume_jacobian = abs(inv(cache.elements.inverse_jacobian[i, j, k, element])) + volume_jacobian = abs(inv(inverse_jacobian[i, j, k, element])) integral += volume_jacobian * weights[i] * weights[j] * weights[k] * func(u, i, j, k, element, equations, dg, args...)
total_volume += volume_jacobian * weights[i] * weights[j] * weights[k] @@ -295,10 +315,18 @@ function integrate(func::Func, u, end end -function analyze(::typeof(entropy_timederivative), du, u, t, +function analyze(::typeof(entropy_timederivative), _du, u, t, mesh::Union{TreeMesh{3}, StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, equations, dg::DG, cache) + # TODO GPU AnalysisCallback currently lives on CPU + backend = trixi_backend(u) + if backend isa Nothing # TODO GPU KA CPU backend + du = _du + else + du = Array(_du) + end + # Calculate ∫(∂S/∂u ⋅ ∂u/∂t)dΩ integrate_via_indices(u, mesh, equations, dg, cache, du) do u, i, j, k, element, equations, dg, du diff --git a/src/callbacks_step/save_solution.jl b/src/callbacks_step/save_solution.jl index ac40bc42de0..71196d6fe1f 100644 --- a/src/callbacks_step/save_solution.jl +++ b/src/callbacks_step/save_solution.jl @@ -280,11 +280,18 @@ end return nothing end -@inline function save_solution_file(u_ode, t, dt, iter, +@inline function save_solution_file(_u_ode, t, dt, iter, semi::AbstractSemidiscretization, solution_callback, element_variables = Dict{Symbol, Any}(), node_variables = Dict{Symbol, Any}(); system = "") + # TODO GPU currently on CPU + backend = trixi_backend(_u_ode) + if backend isa Nothing # TODO GPU KA CPU backend + u_ode = _u_ode + else + u_ode = Array(_u_ode) + end mesh, equations, solver, cache = mesh_equations_solver_cache(semi) u = wrap_array_native(u_ode, mesh, equations, solver, cache) save_solution_file(u, t, dt, iter, mesh, equations, solver, cache, From 2ff2f529b4f7db08828aab475e20e9080896408e Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Thu, 18 Sep 2025 12:09:17 +0200 Subject: [PATCH 53/81] tests --- .../elixir_advection_basic_gpu.jl | 5 +- .../elixir_advection_basic_gpu.jl | 60 +++++++++++++++ test/runtests.jl | 3 +- test/{test_cuda.jl => test_cuda_2d.jl} | 7 +- test/test_cuda_3d.jl | 73 +++++++++++++++++++ 5 files changed, 142 insertions(+), 6 deletions(-) create mode 100644 examples/p4est_3d_dgsem/elixir_advection_basic_gpu.jl rename test/{test_cuda.jl => test_cuda_2d.jl} (98%) create mode 100644 test/test_cuda_3d.jl diff --git a/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl index 6f9e8e56986..ac3934eca7a 100644 --- a/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl +++ b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl @@ -48,9 +48,8 @@ save_solution = SaveSolutionCallback(interval = 100, stepsize_callback = StepsizeCallback(cfl = 1.6) # Create a CallbackSet to collect all callbacks such that they can be passed to the ODE solver -callbacks = CallbackSet(summary_callback, stepsize_callback) -# TODO: GPU.
The `analysis_callback` needs to be updated for GPU support -# analysis_callback, save_solution, stepsize_callback) +callbacks = CallbackSet(summary_callback, analysis_callback, + save_solution, stepsize_callback) ############################################################################### # run the simulation diff --git a/examples/p4est_3d_dgsem/elixir_advection_basic_gpu.jl b/examples/p4est_3d_dgsem/elixir_advection_basic_gpu.jl new file mode 100644 index 00000000000..801ae4cb6bc --- /dev/null +++ b/examples/p4est_3d_dgsem/elixir_advection_basic_gpu.jl @@ -0,0 +1,60 @@ +# The same setup as tree_3d_dgsem/elixir_advection_basic.jl +# to verify the P4estMesh implementation against TreeMesh + +using OrdinaryDiffEqLowStorageRK +using Trixi + +############################################################################### +# semidiscretization of the linear advection equation + +advection_velocity = (0.2, -0.7, 0.5) +equations = LinearScalarAdvectionEquation3D(advection_velocity) + +# Create DG solver with polynomial degree = 3 and (local) Lax-Friedrichs/Rusanov flux as surface flux +solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs) + +coordinates_min = (-1.0, -1.0, -1.0) # minimum coordinates (min(x), min(y), min(z)) +coordinates_max = (1.0, 1.0, 1.0) # maximum coordinates (max(x), max(y), max(z)) + +# Create P4estMesh with 8 x 8 x 8 elements (note `refinement_level=1`) +trees_per_dimension = (4, 4, 4) +mesh = P4estMesh(trees_per_dimension, polydeg = 3, + coordinates_min = coordinates_min, coordinates_max = coordinates_max, + initial_refinement_level = 1) + +# A semidiscretization collects data structures and functions for the spatial discretization +semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition_convergence_test, + solver) + +############################################################################### +# ODE solvers, callbacks etc. 
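+# Note: `real_type` and `storage_type` are the keywords this series threads
+# through `semidiscretize`; `nothing` keeps the `Float64`/`Array` CPU defaults.
+# With CUDA.jl loaded one can instead write, e.g. (cf. test/test_cuda_3d.jl),
+#     ode = semidiscretize(semi, tspan; real_type = Float32, storage_type = CuArray)
+# to adapt the semidiscretization to single-precision GPU storage. The analysis
+# and output callbacks below also work for such runs because PATCH 52 copies
+# device data back to host `Array`s before analyzing or saving.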
+ +# Create ODE problem with time span from 0.0 to 1.0 +tspan = (0.0, 1.0) +ode = semidiscretize(semi, tspan; real_type = nothing, storage_type = nothing) + +# At the beginning of the main loop, the SummaryCallback prints a summary of the simulation setup +# and resets the timers +summary_callback = SummaryCallback() + +# The AnalysisCallback allows to analyse the solution in regular intervals and prints the results +analysis_callback = AnalysisCallback(semi, interval = 100) + +# The SaveSolutionCallback allows to save the solution to a file in regular intervals +save_solution = SaveSolutionCallback(interval = 100, + solution_variables = cons2prim) + +# The StepsizeCallback handles the re-calculation of the maximum Δt after each time step +stepsize_callback = StepsizeCallback(cfl = 1.2) + +# Create a CallbackSet to collect all callbacks such that they can be passed to the ODE solver +callbacks = CallbackSet(summary_callback, analysis_callback, + save_solution, stepsize_callback) + +############################################################################### +# run the simulation + +# OrdinaryDiffEq's `solve` method evolves the solution in time and executes the passed callbacks +sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false); + dt = 0.05, # solve needs some value here but it will be overwritten by the stepsize_callback + ode_default_options()..., callback = callbacks); diff --git a/test/runtests.jl b/test/runtests.jl index 8f35e1fb58d..df348546130 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -113,7 +113,8 @@ const TRIXI_NTHREADS = clamp(Sys.CPU_THREADS, 2, 3) @time if TRIXI_TEST == "all" || TRIXI_TEST == "CUDA" import CUDA if CUDA.functional() - include("test_cuda.jl") + include("test_cuda_2d.jl") + include("test_cuda_3d.jl") else @warn "Unable to run CUDA tests on this machine" end diff --git a/test/test_cuda.jl b/test/test_cuda_2d.jl similarity index 98% rename from test/test_cuda.jl rename to test/test_cuda_2d.jl index 4380ab0e111..da628f890cb 100644 --- a/test/test_cuda.jl +++ b/test/test_cuda_2d.jl @@ -5,11 +5,14 @@ using Trixi include("test_trixi.jl") +EXAMPLES_DIR = joinpath(examples_dir(), "p4est_2d_dgsem") + # Start with a clean environment: remove Trixi.jl output directory if it exists outdir = "out" isdir(outdir) && rm(outdir, recursive = true) -EXAMPLES_DIR = joinpath(examples_dir(), "p4est_2d_dgsem") +@testset "CUDA 2D" begin +#! format: noindent @trixi_testset "elixir_advection_basic_gpu.jl native" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"), @@ -75,5 +78,5 @@ end # Clean up afterwards: delete Trixi.jl output directory @test_nowarn isdir(outdir) && rm(outdir, recursive = true) - +end end # module diff --git a/test/test_cuda_3d.jl b/test/test_cuda_3d.jl new file mode 100644 index 00000000000..f4281e880e4 --- /dev/null +++ b/test/test_cuda_3d.jl @@ -0,0 +1,73 @@ +module TestCUDA + +using Test +using Trixi + +include("test_trixi.jl") + +EXAMPLES_DIR = joinpath(examples_dir(), "p4est_3d_dgsem") + +# Start with a clean environment: remove Trixi.jl output directory if it exists +outdir = "out" +isdir(outdir) && rm(outdir, recursive = true) + +@testset "CUDA 3D" begin +#! format: noindent + +@trixi_testset "elixir_advection_basic_gpu.jl native" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"), + # Expected errors are exactly the same as with TreeMesh! 
+ l2=[0.00016263963870641478], + linf=[0.0014537194925779984]) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + let + t = sol.t[end] + u_ode = sol.u[end] + du_ode = similar(u_ode) + @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000 + end + @test real(ode.p.solver) == Float64 + @test real(ode.p.solver.basis) == Float64 + @test real(ode.p.solver.mortar) == Float64 + # TODO: remake ignores the mesh itself as well + @test real(ode.p.mesh) == Float64 + + @test ode.u0 isa Array + @test ode.p.solver.basis.derivative_matrix isa Array + + @test Trixi.storage_type(ode.p.cache.elements) === Array + @test Trixi.storage_type(ode.p.cache.interfaces) === Array + @test Trixi.storage_type(ode.p.cache.boundaries) === Array + @test Trixi.storage_type(ode.p.cache.mortars) === Array +end + +@trixi_testset "elixir_advection_basic_gpu.jl Float32 / CUDA" begin + # Using CUDA inside the testset since otherwise the bindings are hidden by the anonymous modules + using CUDA + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"), + # Expected errors similar to reference on CPU + l2=[Float32(0.00016263963870641478)], + linf=[Float32(0.0014537194925779984)], + RealT=Float32, + real_type=Float32, + storage_type=CuArray) + @test real(ode.p.solver) == Float32 + @test real(ode.p.solver.basis) == Float32 + @test real(ode.p.solver.mortar) == Float32 + # TODO: remake ignores the mesh itself as well + @test real(ode.p.mesh) == Float64 + + @test ode.u0 isa CuArray + @test ode.p.solver.basis.derivative_matrix isa CuArray + + @test Trixi.storage_type(ode.p.cache.elements) === CuArray + @test Trixi.storage_type(ode.p.cache.interfaces) === CuArray + @test Trixi.storage_type(ode.p.cache.boundaries) === CuArray + @test Trixi.storage_type(ode.p.cache.mortars) === CuArray +end + +# Clean up afterwards: delete Trixi.jl output directory +@test_nowarn isdir(outdir) && rm(outdir, recursive = true) +end +end # module From bc4ad17b482ed85976397043031d5cc9f7fec739 Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Fri, 19 Sep 2025 09:38:22 +0200 Subject: [PATCH 54/81] add benchmark --- benchmark/CUDA/Project.toml | 6 ++ .../CUDA/elixir_euler_taylor_green_vortex.jl | 82 +++++++++++++++++++ benchmark/CUDA/run.jl | 78 ++++++++++++++++++ 3 files changed, 166 insertions(+) create mode 100644 benchmark/CUDA/Project.toml create mode 100644 benchmark/CUDA/elixir_euler_taylor_green_vortex.jl create mode 100644 benchmark/CUDA/run.jl diff --git a/benchmark/CUDA/Project.toml b/benchmark/CUDA/Project.toml new file mode 100644 index 00000000000..221c03a5947 --- /dev/null +++ b/benchmark/CUDA/Project.toml @@ -0,0 +1,6 @@ +[deps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +OrdinaryDiffEqLowStorageRK = "b0944070-b475-4768-8dec-fb6eb410534d" +TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +Trixi = "a7f1ee26-1774-49b1-8366-f1abc58fbfcb" diff --git a/benchmark/CUDA/elixir_euler_taylor_green_vortex.jl b/benchmark/CUDA/elixir_euler_taylor_green_vortex.jl new file mode 100644 index 00000000000..2b4275afc86 --- /dev/null +++ b/benchmark/CUDA/elixir_euler_taylor_green_vortex.jl @@ -0,0 +1,82 @@ +using OrdinaryDiffEqLowStorageRK +using Trixi + +############################################################################### +# semidiscretization of the compressible Euler equations + +equations = CompressibleEulerEquations3D(1.4) + +function initial_condition_taylor_green_vortex(x, t, + equations::CompressibleEulerEquations3D) + A
= 1.0 # magnitude of speed + Ms = 0.1 # maximum Mach number + + rho = 1.0 + v1 = A * sin(x[1]) * cos(x[2]) * cos(x[3]) + v2 = -A * cos(x[1]) * sin(x[2]) * cos(x[3]) + v3 = 0.0 + p = (A / Ms)^2 * rho / equations.gamma # scaling to get Ms + p = p + 1.0/16.0 * A^2 * rho * (cos(2*x[1])*cos(2*x[3]) + + 2*cos(2*x[2]) + 2*cos(2*x[1]) + cos(2*x[2])*cos(2*x[3])) + + return prim2cons(SVector(rho, v1, v2, v3, p), equations) +end + +initial_condition = initial_condition_taylor_green_vortex + +# TODO Undefined external symbol "log" +#volume_flux = flux_ranocha +volume_flux = flux_lax_friedrichs +solver = DGSEM(polydeg=5, surface_flux=volume_flux) +# TODO flux diff + #volume_integral=VolumeIntegralFluxDifferencing(volume_flux)) + +coordinates_min = (-1.0, -1.0, -1.0) .* pi +coordinates_max = ( 1.0, 1.0, 1.0) .* pi + +initial_refinement_level = 1 +trees_per_dimension = (4, 4, 4) + +mesh = P4estMesh(trees_per_dimension, polydeg=1, + coordinates_min=coordinates_min, coordinates_max=coordinates_max, + periodicity=true, initial_refinement_level=initial_refinement_level) + +semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver) + + +############################################################################### +# ODE solvers, callbacks etc. + +tspan = (0.0, 100.0) +ode = semidiscretize(semi, tspan; storage_type=nothing, real_type=nothing) + +summary_callback = SummaryCallback() + +stepsize_callback = StepsizeCallback(cfl=0.1) + +callbacks = CallbackSet(summary_callback, + stepsize_callback) + + +############################################################################### +# run the simulation + +maxiters = 200 +run_profiler = false + +# disable warnings when maxiters is reached +integrator = init(ode, CarpenterKennedy2N54(williamson_condition=false), + dt=1.0, + save_everystep=false, callback=callbacks, + maxiters=maxiters, verbose=false) +if run_profiler + prof_result = CUDA.@profile solve!(integrator) + # the internal profiler will return the results to be printed + if isa(prof_result, CUDA.Profile.ProfileResults) + print(prof_result) + end +else + solve!(integrator) +end + +finalize(mesh) diff --git a/benchmark/CUDA/run.jl b/benchmark/CUDA/run.jl new file mode 100644 index 00000000000..cc1b62306f0 --- /dev/null +++ b/benchmark/CUDA/run.jl @@ -0,0 +1,78 @@ +using Trixi +using CUDA +using TimerOutputs +using JSON + +function main(elixir_path) + + # setup + maxiters = 10 + initial_refinement_level = 3 + storage_type = CuArray + real_type = Float64 + + println("Warming up...") + + # start simulation with tiny final time to trigger compilation + duration_compile = @elapsed begin + trixi_include(elixir_path, + tspan=(0.0, 1e-14), + storage_type=storage_type, + real_type=real_type) + trixi_include(elixir_path, + tspan=(0.0, 1e-14), + storage_type=storage_type, + real_type=Float32) + end + + println("Finished warm-up in $duration_compile seconds\n") + println("Starting simulation...") + + # start the real simulation + duration_elixir = @elapsed trixi_include(elixir_path, + maxiters=maxiters, + initial_refinement_level=initial_refinement_level, + storage_type=storage_type, + real_type=real_type) + + # store metrics (on every rank!) + metrics = Dict{String, Float64}("elapsed time" => duration_elixir) + + # read TimerOutputs timings + timer = Trixi.timer() + metrics["total time"] = 1.0e-9 * TimerOutputs.tottime(timer) + metrics["rhs! 
time"] = 1.0e-9 * TimerOutputs.time(timer["rhs!"]) + + # compute performance index + nrhscalls = Trixi.ncalls(semi.performance_counter) + walltime = 1.0e-9 * take!(semi.performance_counter) + metrics["PID"] = walltime * Trixi.mpi_nranks() / (Trixi.ndofsglobal(semi) * nrhscalls) + + # write json file + open("metrics.out", "w") do f + indent = 2 + JSON.print(f, metrics, indent) + end + + # run profiler + println("Running profiler (Float64)...") + trixi_include(elixir_path, + maxiters=5, + initial_refinement_level=initial_refinement_level, + storage_type=storage_type, + real_type=Float64, + run_profiler=true) + + println("Running profiler (Float32)...") + trixi_include(elixir_path, + maxiters=5, + initial_refinement_level=initial_refinement_level, + storage_type=storage_type, + real_type=Float32, + run_profiler=true) +end + +# hardcoded elixir +elixir_path = joinpath(@__DIR__(), "elixir_euler_taylor_green_vortex.jl") + +main(elixir_path) From de06c618980623845f67f913bf248f64599ccf3c Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Fri, 19 Sep 2025 09:38:56 +0200 Subject: [PATCH 55/81] fix max_dt --- src/Trixi.jl | 2 +- src/callbacks_step/stepsize_dg3d.jl | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Trixi.jl b/src/Trixi.jl index 9412c33db6f..e0d4f2dc24b 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -59,7 +59,7 @@ using DiffEqCallbacks: PeriodicCallback, PeriodicCallbackAffect using FillArrays: Ones, Zeros using ForwardDiff: ForwardDiff using HDF5: HDF5, h5open, attributes, create_dataset, datatype, dataspace -using KernelAbstractions: KernelAbstractions, @index, @kernel, get_backend, Backend +using KernelAbstractions: KernelAbstractions, @index, @kernel, get_backend, Backend, allocate using LinearMaps: LinearMap if _PREFERENCE_LOOPVECTORIZATION using LoopVectorization: LoopVectorization, @turbo, indices diff --git a/src/callbacks_step/stepsize_dg3d.jl b/src/callbacks_step/stepsize_dg3d.jl index 159dca720d6..c609b0a5fe4 100644 --- a/src/callbacks_step/stepsize_dg3d.jl +++ b/src/callbacks_step/stepsize_dg3d.jl @@ -86,17 +86,17 @@ end @kernel function max_scaled_speed_KAkernel!(max_scaled_speeds, u, meshT, equations, dg, contravariant_vectors, inverse_jacobian) element = @index(Global) - max_scaled_speeds[element] = max_scaled_speed_element(du, meshT, - equations, - surface_integral, dg, - surface_flux_values, element) + max_scaled_speeds[element] = max_scaled_speed_element(u, meshT, equations, dg, + contravariant_vectors, + inverse_jacobian, + element) end function max_scaled_speed_element(u, ::Type{<:Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}}, equations, dg, contravariant_vectors, inverse_jacobian, element) - max_lambda1 = max_lambda2 = max_lambda3 = zero(max_scaled_speed) + max_lambda1 = max_lambda2 = max_lambda3 = zero(eltype(u)) for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) u_node = get_node_vars(u, equations, dg, i, j, k, element) lambda1, lambda2, lambda3 = max_abs_speeds(u_node, equations) From 29298a5a069e806ed21aa91fdb4e71af0081be32 Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Thu, 25 Sep 2025 21:41:37 +0200 Subject: [PATCH 56/81] profiler output --- benchmark/CUDA/Project.toml | 6 ------ .../CUDA/elixir_euler_taylor_green_vortex.jl | 5 +---- benchmark/CUDA/run.jl | 17 ++++++++++++++--- 3 files changed, 15 insertions(+), 13 deletions(-) delete mode 100644 benchmark/CUDA/Project.toml diff --git a/benchmark/CUDA/Project.toml b/benchmark/CUDA/Project.toml deleted file mode 100644 index 221c03a5947..00000000000 --- 
a/benchmark/CUDA/Project.toml +++ /dev/null @@ -1,6 +0,0 @@ -[deps] -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" -OrdinaryDiffEqLowStorageRK = "b0944070-b475-4768-8dec-fb6eb410534d" -TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -Trixi = "a7f1ee26-1774-49b1-8366-f1abc58fbfcb" diff --git a/benchmark/CUDA/elixir_euler_taylor_green_vortex.jl b/benchmark/CUDA/elixir_euler_taylor_green_vortex.jl index 2b4275afc86..4e9c777fe7c 100644 --- a/benchmark/CUDA/elixir_euler_taylor_green_vortex.jl +++ b/benchmark/CUDA/elixir_euler_taylor_green_vortex.jl @@ -71,12 +71,9 @@ integrator = init(ode, CarpenterKennedy2N54(williamson_condition=false), maxiters=maxiters, verbose=false) if run_profiler prof_result = CUDA.@profile solve!(integrator) - # the internal profiler will return the results to be printed - if isa(prof_result, CUDA.Profile.ProfileResults) - print(prof_result) - end else solve!(integrator) + prof_result = nothing end finalize(mesh) diff --git a/benchmark/CUDA/run.jl b/benchmark/CUDA/run.jl index cc1b62306f0..d42fac4af23 100644 --- a/benchmark/CUDA/run.jl +++ b/benchmark/CUDA/run.jl @@ -6,7 +6,7 @@ using JSON function main(elixir_path) # setup - maxiters = 10 + maxiters = 50 initial_refinement_level = 3 storage_type = CuArray real_type = Float64 @@ -55,21 +55,32 @@ function main(elixir_path) end # run profiler + maxiters = 5 + initial_refinement_level = 2 + println("Running profiler (Float64)...") trixi_include(elixir_path, - maxiters=5, + maxiters=maxiters, initial_refinement_level=initial_refinement_level, storage_type=storage_type, real_type=Float64, run_profiler=true) + open("profile_float64.txt", "w") do io + show(io, prof_result) + end + println("Running profiler (Float32)...") trixi_include(elixir_path, - maxiters=5, + maxiters=maxiters, initial_refinement_level=initial_refinement_level, storage_type=storage_type, real_type=Float32, run_profiler=true) + + open("profile_float32.txt", "w") do io + show(io, prof_result) + end end # hardcoded elixir From 962a383a520a28eb5ec5392a9f3e3b497babfe98 Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Mon, 29 Sep 2025 15:43:02 +0200 Subject: [PATCH 57/81] fmt --- .../CUDA/elixir_euler_taylor_green_vortex.jl | 42 +++++++++--------- benchmark/CUDA/run.jl | 44 +++++++++---------- src/Trixi.jl | 3 +- src/solvers/dgsem_p4est/dg_3d.jl | 4 +- 4 files changed, 47 insertions(+), 46 deletions(-) diff --git a/benchmark/CUDA/elixir_euler_taylor_green_vortex.jl b/benchmark/CUDA/elixir_euler_taylor_green_vortex.jl index 4e9c777fe7c..de491a3761b 100644 --- a/benchmark/CUDA/elixir_euler_taylor_green_vortex.jl +++ b/benchmark/CUDA/elixir_euler_taylor_green_vortex.jl @@ -8,16 +8,18 @@ equations = CompressibleEulerEquations3D(1.4) function initial_condition_taylor_green_vortex(x, t, equations::CompressibleEulerEquations3D) - A = 1.0 # magnitude of speed + A = 1.0 # magnitude of speed Ms = 0.1 # maximum Mach number rho = 1.0 - v1 = A * sin(x[1]) * cos(x[2]) * cos(x[3]) - v2 = -A * cos(x[1]) * sin(x[2]) * cos(x[3]) - v3 = 0.0 - p = (A / Ms)^2 * rho / equations.gamma # scaling to get Ms - p = p + 1.0/16.0 * A^2 * rho * (cos(2*x[1])*cos(2*x[3]) + - 2*cos(2*x[2]) + 2*cos(2*x[1]) + cos(2*x[2])*cos(2*x[3])) + v1 = A * sin(x[1]) * cos(x[2]) * cos(x[3]) + v2 = -A * cos(x[1]) * sin(x[2]) * cos(x[3]) + v3 = 0.0 + p = (A / Ms)^2 * rho / equations.gamma # scaling to get Ms + p = p + + 1.0 / 16.0 * A^2 * rho * + (cos(2 * x[1]) * cos(2 * x[3]) + + 2 * cos(2 * x[2]) + 2 * cos(2 * x[1]) + cos(2 * x[2]) * cos(2 * x[3])) 
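# (Equivalently, in the compact form usually quoted for the Taylor-Green
#  vortex: p = p0 + (A^2 * rho / 16) * (cos(2 * x[1]) + cos(2 * x[2])) *
#  (cos(2 * x[3]) + 2), where p0 = (A / Ms)^2 * rho / equations.gamma is the
#  background pressure computed above; expanding the product yields exactly
#  the four cosine terms written out here.)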
return prim2cons(SVector(rho, v1, v2, v3, p), equations) end @@ -27,37 +29,35 @@ initial_condition = initial_condition_taylor_green_vortex # TODO Undefined external symbol "log" #volume_flux = flux_ranocha volume_flux = flux_lax_friedrichs -solver = DGSEM(polydeg=5, surface_flux=volume_flux) +solver = DGSEM(polydeg = 5, surface_flux = volume_flux) # TODO flux diff - #volume_integral=VolumeIntegralFluxDifferencing(volume_flux)) +#volume_integral=VolumeIntegralFluxDifferencing(volume_flux)) coordinates_min = (-1.0, -1.0, -1.0) .* pi -coordinates_max = ( 1.0, 1.0, 1.0) .* pi +coordinates_max = (1.0, 1.0, 1.0) .* pi initial_refinement_level = 1 trees_per_dimension = (4, 4, 4) -mesh = P4estMesh(trees_per_dimension, polydeg=1, - coordinates_min=coordinates_min, coordinates_max=coordinates_max, - periodicity=true, initial_refinement_level=initial_refinement_level) +mesh = P4estMesh(trees_per_dimension, polydeg = 1, + coordinates_min = coordinates_min, coordinates_max = coordinates_max, + periodicity = true, initial_refinement_level = initial_refinement_level) semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver) - ############################################################################### # ODE solvers, callbacks etc. tspan = (0.0, 100.0) -ode = semidiscretize(semi, tspan; storage_type=nothing, real_type=nothing) +ode = semidiscretize(semi, tspan; storage_type = nothing, real_type = nothing) summary_callback = SummaryCallback() -stepsize_callback = StepsizeCallback(cfl=0.1) +stepsize_callback = StepsizeCallback(cfl = 0.1) callbacks = CallbackSet(summary_callback, stepsize_callback) - ############################################################################### # run the simulation @@ -65,10 +65,10 @@ maxiters = 200 run_profiler = false # disable warnings when maxiters is reached -integrator = init(ode, CarpenterKennedy2N54(williamson_condition=false), - dt=1.0, - save_everystep=false, callback=callbacks, - maxiters=maxiters, verbose=false) +integrator = init(ode, CarpenterKennedy2N54(williamson_condition = false), + dt = 1.0, + save_everystep = false, callback = callbacks, + maxiters = maxiters, verbose = false) if run_profiler prof_result = CUDA.@profile solve!(integrator) else diff --git a/benchmark/CUDA/run.jl b/benchmark/CUDA/run.jl index d42fac4af23..5b9f318bfdb 100644 --- a/benchmark/CUDA/run.jl +++ b/benchmark/CUDA/run.jl @@ -16,13 +16,13 @@ function main(elixir_path) # start simulation with tiny final time to trigger compilation duration_compile = @elapsed begin trixi_include(elixir_path, - tspan=(0.0, 1e-14), - storage_type=storage_type, - real_type=real_type) + tspan = (0.0, 1e-14), + storage_type = storage_type, + real_type = real_type) trixi_include(elixir_path, - tspan=(0.0, 1e-14), - storage_type=storage_type, - real_type=Float32) + tspan = (0.0, 1e-14), + storage_type = storage_type, + real_type = Float32) end println("Finished warm-up in $duration_compile seconds\n") @@ -30,10 +30,10 @@ function main(elixir_path) # start the real simulation duration_elixir = @elapsed trixi_include(elixir_path, - maxiters=maxiters, - initial_refinement_level=initial_refinement_level, - storage_type=storage_type, - real_type=real_type) + maxiters = maxiters, + initial_refinement_level = initial_refinement_level, + storage_type = storage_type, + real_type = real_type) # store metrics (on every rank!) 
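# (For reference: the "PID" stored below, computed in PATCH 54 as
#  walltime * Trixi.mpi_nranks() / (Trixi.ndofsglobal(semi) * nrhscalls),
#  is Trixi.jl's performance index, i.e. wall time per `rhs!` evaluation
#  and per degree of freedom; smaller is better.)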
metrics = Dict{String, Float64}("elapsed time" => duration_elixir) @@ -60,26 +60,26 @@ function main(elixir_path) println("Running profiler (Float64)...") trixi_include(elixir_path, - maxiters=maxiters, - initial_refinement_level=initial_refinement_level, - storage_type=storage_type, - real_type=Float64, - run_profiler=true) + maxiters = maxiters, + initial_refinement_level = initial_refinement_level, + storage_type = storage_type, + real_type = Float64, + run_profiler = true) open("profile_float64.txt", "w") do io - show(io, prof_result) + show(io, prof_result) end println("Running profiler (Float32)...") trixi_include(elixir_path, - maxiters=maxiters, - initial_refinement_level=initial_refinement_level, - storage_type=storage_type, - real_type=Float32, - run_profiler=true) + maxiters = maxiters, + initial_refinement_level = initial_refinement_level, + storage_type = storage_type, + real_type = Float32, + run_profiler = true) open("profile_float32.txt", "w") do io - show(io, prof_result) + show(io, prof_result) end end diff --git a/src/Trixi.jl b/src/Trixi.jl index e94d7fdbe68..289e48c572e 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -59,7 +59,8 @@ using DiffEqCallbacks: PeriodicCallback, PeriodicCallbackAffect using FillArrays: Ones, Zeros using ForwardDiff: ForwardDiff using HDF5: HDF5, h5open, attributes, create_dataset, datatype, dataspace -using KernelAbstractions: KernelAbstractions, @index, @kernel, get_backend, Backend, allocate +using KernelAbstractions: KernelAbstractions, @index, @kernel, get_backend, Backend, + allocate using LinearMaps: LinearMap if _PREFERENCE_LOOPVECTORIZATION using LoopVectorization: LoopVectorization, @turbo, indices diff --git a/src/solvers/dgsem_p4est/dg_3d.jl b/src/solvers/dgsem_p4est/dg_3d.jl index 510f4d3c717..8013bb6d8db 100644 --- a/src/solvers/dgsem_p4est/dg_3d.jl +++ b/src/solvers/dgsem_p4est/dg_3d.jl @@ -237,14 +237,14 @@ end @kernel function calc_interface_flux_KAkernel!(surface_flux_values, meshT, nonconservative_terms, equations, - surface_integral, solverT, u_inferface, + surface_integral, solverT, u_interface, neighbor_ids, node_indices, contravariant_vectors, index_range) interface = @index(Global) calc_interface_flux_interface!(surface_flux_values, meshT, nonconservative_terms, - equations, surface_integral, solverT, u_inferface, + equations, surface_integral, solverT, u_interface, neighbor_ids, node_indices, contravariant_vectors, index_range, interface) end From a60e27d0d6df9beceff5efecfc1ae2cea21fef7b Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Mon, 29 Sep 2025 17:28:07 +0200 Subject: [PATCH 58/81] missed max_dt calls --- benchmark/CUDA/run.jl | 2 +- src/callbacks_step/stepsize.jl | 3 ++- src/callbacks_step/stepsize_dg1d.jl | 4 ++-- src/semidiscretization/semidiscretization_euler_gravity.jl | 3 ++- .../paired_explicit_runge_kutta.jl | 3 ++- 5 files changed, 9 insertions(+), 6 deletions(-) diff --git a/benchmark/CUDA/run.jl b/benchmark/CUDA/run.jl index 5b9f318bfdb..70c840722af 100644 --- a/benchmark/CUDA/run.jl +++ b/benchmark/CUDA/run.jl @@ -56,7 +56,7 @@ function main(elixir_path) # run profiler maxiters = 5 - initial_refinement_level = 2 + initial_refinement_level = 1 println("Running profiler (Float64)...") trixi_include(elixir_path, diff --git a/src/callbacks_step/stepsize.jl b/src/callbacks_step/stepsize.jl index fd5c4f63ff5..f6f04d09893 100644 --- a/src/callbacks_step/stepsize.jl +++ b/src/callbacks_step/stepsize.jl @@ -168,6 +168,7 @@ function calculate_dt(u_ode, t, cfl_advective, cfl_diffusive, equations_parabolic = 
semi.equations_parabolic u = wrap_array(u_ode, mesh, equations, solver, cache) + backend = trixi_backend(u_ode) dt_advective = cfl_advective(t) * max_dt(backend, u, t, mesh, have_constant_speed(equations), equations, @@ -175,7 +176,7 @@ function calculate_dt(u_ode, t, cfl_advective, cfl_diffusive, cfl_diff = cfl_diffusive(t) if cfl_diff > 0 # Check if diffusive CFL should be considered - dt_diffusive = cfl_diff * max_dt(u, t, mesh, + dt_diffusive = cfl_diff * max_dt(backend, u, t, mesh, have_constant_diffusivity(equations_parabolic), equations, equations_parabolic, solver, cache) diff --git a/src/callbacks_step/stepsize_dg1d.jl b/src/callbacks_step/stepsize_dg1d.jl index c4cd159edfe..e0cac1ce57c 100644 --- a/src/callbacks_step/stepsize_dg1d.jl +++ b/src/callbacks_step/stepsize_dg1d.jl @@ -29,7 +29,7 @@ function max_dt(backend, u, t, mesh::TreeMesh{1}, return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(u, t, mesh::TreeMesh{1}, +function max_dt(backend, u, t, mesh::TreeMesh{1}, constant_diffusivity::False, equations, equations_parabolic::AbstractEquationsParabolic, dg::DG, cache) @@ -72,7 +72,7 @@ function max_dt(backend, u, t, mesh::TreeMesh{1}, return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(u, t, mesh::TreeMesh{1}, +function max_dt(backend, u, t, mesh::TreeMesh{1}, constant_diffusivity::True, equations, equations_parabolic::AbstractEquationsParabolic, dg::DG, cache) diff --git a/src/semidiscretization/semidiscretization_euler_gravity.jl b/src/semidiscretization/semidiscretization_euler_gravity.jl index 0b1efc00aef..c194da63f90 100644 --- a/src/semidiscretization/semidiscretization_euler_gravity.jl +++ b/src/semidiscretization/semidiscretization_euler_gravity.jl @@ -306,6 +306,7 @@ function update_gravity!(semi::SemidiscretizationEulerGravity, u_ode) u_euler = wrap_array(u_ode, semi_euler) u_gravity = wrap_array(cache.u_ode, semi_gravity) du_gravity = wrap_array(cache.du_ode, semi_gravity) + backend = trixi_backend(u_ode) # set up main loop finalstep = false @@ -317,7 +318,7 @@ function update_gravity!(semi::SemidiscretizationEulerGravity, u_ode) @unpack equations = semi_gravity while !finalstep dtau = @trixi_timeit timer() "calculate dtau" begin - cfl * max_dt(u_gravity, tau, semi_gravity.mesh, + cfl * max_dt(backend, u_gravity, tau, semi_gravity.mesh, have_constant_speed(equations), equations, semi_gravity.solver, semi_gravity.cache) end diff --git a/src/time_integration/paired_explicit_runge_kutta/paired_explicit_runge_kutta.jl b/src/time_integration/paired_explicit_runge_kutta/paired_explicit_runge_kutta.jl index 333ebc14983..4e87c9ff35f 100644 --- a/src/time_integration/paired_explicit_runge_kutta/paired_explicit_runge_kutta.jl +++ b/src/time_integration/paired_explicit_runge_kutta/paired_explicit_runge_kutta.jl @@ -57,8 +57,9 @@ function calculate_cfl(ode_algorithm::AbstractPairedExplicitRK, ode) mesh, equations, solver, cache = mesh_equations_solver_cache(semi) u = wrap_array(u_ode, mesh, equations, solver, cache) + backend = trixi_backend(u_ode) - cfl_number = dt_opt / max_dt(u, t0, mesh, + cfl_number = dt_opt / max_dt(backend, u, t0, mesh, have_constant_speed(equations), equations, solver, cache) return cfl_number From 2073d7cd7d135fd00511c386be06ecea7d76638c Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Tue, 30 Sep 2025 10:21:25 +0200 Subject: [PATCH 59/81] some fixes --- .../semidiscretization_hyperbolic_parabolic.jl | 3 ++- src/solvers/dgsem_tree/dg_3d.jl | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git 
a/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl b/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl index 54ede387fa2..e020903df2c 100644 --- a/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl +++ b/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl @@ -330,10 +330,11 @@ function rhs!(du_ode, u_ode, semi::SemidiscretizationHyperbolicParabolic, t) u = wrap_array(u_ode, mesh, equations, solver, cache) du = wrap_array(du_ode, mesh, equations, solver, cache) + backend = trixi_backend(u_ode) # TODO: Taal decide, do we need to pass the mesh? time_start = time_ns() - @trixi_timeit timer() "rhs!" rhs!(du, u, t, mesh, equations, + @trixi_timeit timer() "rhs!" rhs!(backend, du, u, t, mesh, equations, boundary_conditions, source_terms, solver, cache) runtime = time_ns() - time_start put!(semi.performance_counter.counters[1], runtime) diff --git a/src/solvers/dgsem_tree/dg_3d.jl b/src/solvers/dgsem_tree/dg_3d.jl index 914018ce8b4..5a651ec38ba 100644 --- a/src/solvers/dgsem_tree/dg_3d.jl +++ b/src/solvers/dgsem_tree/dg_3d.jl @@ -307,7 +307,7 @@ See also https://github.com/trixi-framework/Trixi.jl/issues/1671#issuecomment-17 return nothing end -function calc_volume_integral!(du, u, +function calc_volume_integral!(backend::Nothing, du, u, mesh::Union{TreeMesh{3}, StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, have_nonconservative_terms, equations, @@ -427,7 +427,7 @@ end end # TODO: Taal dimension agnostic -function calc_volume_integral!(du, u, +function calc_volume_integral!(backend::Nothing, du, u, mesh::Union{TreeMesh{3}, StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, have_nonconservative_terms, equations, @@ -468,7 +468,7 @@ function calc_volume_integral!(du, u, end # TODO: Taal dimension agnostic -function calc_volume_integral!(du, u, +function calc_volume_integral!(backend::Nothing, du, u, mesh::Union{TreeMesh{3}, StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, have_nonconservative_terms, equations, From 9a2f130c41aaab95f0a2c33b8793014d1ba455c3 Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Tue, 30 Sep 2025 16:33:21 +0200 Subject: [PATCH 60/81] after merge fixes --- src/solvers/dgsem_p4est/dg_3d.jl | 2 +- src/solvers/dgsem_tree/dg_3d.jl | 19 ++++++++++++++++--- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/src/solvers/dgsem_p4est/dg_3d.jl b/src/solvers/dgsem_p4est/dg_3d.jl index 1713f0693a9..39a8a24de65 100644 --- a/src/solvers/dgsem_p4est/dg_3d.jl +++ b/src/solvers/dgsem_p4est/dg_3d.jl @@ -299,7 +299,7 @@ function calc_interface_flux_interface!(surface_flux_values, i_primary, j_primary, k_primary, primary_element) - calc_interface_flux!(surface_flux_values, meshT, nonconservative_terms, + calc_interface_flux!(surface_flux_values, meshT, have_nonconservative_terms, equations, surface_integral, solverT, u_interface, interface, normal_direction, diff --git a/src/solvers/dgsem_tree/dg_3d.jl b/src/solvers/dgsem_tree/dg_3d.jl index 5a651ec38ba..62f7ee7f78c 100644 --- a/src/solvers/dgsem_tree/dg_3d.jl +++ b/src/solvers/dgsem_tree/dg_3d.jl @@ -228,8 +228,21 @@ function rhs!(backend, du, u, t, return nothing end +function calc_volume_integral!(backend, du, u, + mesh::TreeMesh{3}, + have_nonconservative_terms, equations, + volume_integral::VolumeIntegralWeakForm, + dg::DGSEM, cache) + @threaded for element in eachelement(dg, cache) + weak_form_kernel!(du, u, element, mesh, + have_nonconservative_terms, equations, + dg, cache) + end + return nothing +end + function calc_volume_integral!(backend::Nothing, du, u, - 
mesh::Union{TreeMesh{3}, StructuredMesh{3}, P4estMesh{3}, + mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, have_nonconservative_terms, equations, volume_integral::VolumeIntegralWeakForm, @@ -244,7 +257,7 @@ function calc_volume_integral!(backend::Nothing, du, u, end function calc_volume_integral!(backend::Backend, du, u, - mesh::Union{TreeMesh{3}, StructuredMesh{3}, P4estMesh{3}, + mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, have_nonconservative_terms, equations, volume_integral::VolumeIntegralWeakForm, @@ -652,7 +665,7 @@ end return nothing end -function prolong2interfaces!(cache, u, mesh::TreeMesh{3}, equations, dg::DG) +function prolong2interfaces!(backend, cache, u, mesh::TreeMesh{3}, equations, dg::DG) @unpack interfaces = cache @unpack orientations, neighbor_ids = interfaces interfaces_u = interfaces.u From 9a47f292056c934d6b11239ab7b28e31c6689ec2 Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Tue, 30 Sep 2025 21:37:57 +0200 Subject: [PATCH 61/81] some more fixes --- src/solvers/dgsem_tree/dg_3d.jl | 4 ++-- src/solvers/dgsem_tree/dg_3d_parabolic.jl | 2 +- src/solvers/fdsbp_tree/fdsbp_3d.jl | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/solvers/dgsem_tree/dg_3d.jl b/src/solvers/dgsem_tree/dg_3d.jl index 62f7ee7f78c..e7795260c6f 100644 --- a/src/solvers/dgsem_tree/dg_3d.jl +++ b/src/solvers/dgsem_tree/dg_3d.jl @@ -703,7 +703,7 @@ function prolong2interfaces!(backend, cache, u, mesh::TreeMesh{3}, equations, dg return nothing end -function calc_interface_flux!(surface_flux_values, +function calc_interface_flux!(backend, surface_flux_values, mesh::TreeMesh{3}, have_nonconservative_terms::False, equations, surface_integral, dg::DG, cache) @@ -738,7 +738,7 @@ function calc_interface_flux!(surface_flux_values, return nothing end -function calc_interface_flux!(surface_flux_values, +function calc_interface_flux!(backend, surface_flux_values, mesh::TreeMesh{3}, have_nonconservative_terms::True, equations, surface_integral, dg::DG, cache) diff --git a/src/solvers/dgsem_tree/dg_3d_parabolic.jl b/src/solvers/dgsem_tree/dg_3d_parabolic.jl index a39d704199d..ee614b873db 100644 --- a/src/solvers/dgsem_tree/dg_3d_parabolic.jl +++ b/src/solvers/dgsem_tree/dg_3d_parabolic.jl @@ -974,7 +974,7 @@ function calc_gradient!(gradients, u_transformed, t, # Prolong solution to interfaces @trixi_timeit timer() "prolong2interfaces" begin - prolong2interfaces!(cache_parabolic, u_transformed, mesh, + prolong2interfaces!(nothing, cache_parabolic, u_transformed, mesh, equations_parabolic, dg) end diff --git a/src/solvers/fdsbp_tree/fdsbp_3d.jl b/src/solvers/fdsbp_tree/fdsbp_3d.jl index 8d220217216..b89dc3bee93 100644 --- a/src/solvers/fdsbp_tree/fdsbp_3d.jl +++ b/src/solvers/fdsbp_tree/fdsbp_3d.jl @@ -40,7 +40,7 @@ function create_cache(mesh::TreeMesh{3}, equations, end # 3D volume integral contributions for `VolumeIntegralStrongForm` -function calc_volume_integral!(du, u, +function calc_volume_integral!(backend, du, u, mesh::TreeMesh{3}, have_nonconservative_terms::False, equations, volume_integral::VolumeIntegralStrongForm, @@ -103,7 +103,7 @@ end # the finite difference stencils. Thus, the D^- operator acts on the positive # part of the flux splitting f^+ and the D^+ operator acts on the negative part # of the flux splitting f^-. 
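# As an illustrative example (an assumption for exposition, not code used here):
# for scalar advection with flux f(u) = a * u, a natural splitting is
# f^+(u) = max(a, 0) * u and f^-(u) = min(a, 0) * u, so that
# D^- f^+ + D^+ f^- yields an upwind-biased, stable discretization.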
-function calc_volume_integral!(du, u, +function calc_volume_integral!(backend, du, u, mesh::TreeMesh{3}, have_nonconservative_terms::False, equations, volume_integral::VolumeIntegralUpwind, From 6ffb69fed093819ba7952805a772c0f7d54f97bf Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Wed, 1 Oct 2025 14:11:43 +0200 Subject: [PATCH 62/81] post merge fixes --- .../dgsem_tree/dg_2d_subcell_limiters.jl | 2 +- src/solvers/dgsem_tree/dg_3d.jl | 80 ------------------- src/solvers/fdsbp_tree/fdsbp_1d.jl | 4 +- src/solvers/fdsbp_tree/fdsbp_2d.jl | 4 +- src/solvers/fdsbp_tree/fdsbp_3d.jl | 4 +- src/solvers/fdsbp_unstructured/fdsbp_2d.jl | 4 +- 6 files changed, 9 insertions(+), 89 deletions(-) diff --git a/src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl b/src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl index f87fcbdcd32..bb1126c02f9 100644 --- a/src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl +++ b/src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl @@ -60,7 +60,7 @@ function create_cache(mesh::Union{TreeMesh{2}, StructuredMesh{2}, P4estMesh{2}}, end # Subcell limiting currently only implemented for certain mesh types -function calc_volume_integral!(du, u, +function calc_volume_integral!(backend::Nothing, du, u, mesh::Union{TreeMesh{2}, StructuredMesh{2}, P4estMesh{2}}, have_nonconservative_terms, equations, diff --git a/src/solvers/dgsem_tree/dg_3d.jl b/src/solvers/dgsem_tree/dg_3d.jl index 5abbfc7349b..6ae047d519c 100644 --- a/src/solvers/dgsem_tree/dg_3d.jl +++ b/src/solvers/dgsem_tree/dg_3d.jl @@ -233,24 +233,6 @@ See also https://github.com/trixi-framework/Trixi.jl/issues/1671#issuecomment-17 return nothing end -<<<<<<< HEAD -function calc_volume_integral!(backend::Nothing, du, u, - mesh::Union{TreeMesh{3}, StructuredMesh{3}, P4estMesh{3}, - T8codeMesh{3}}, - have_nonconservative_terms, equations, - volume_integral::VolumeIntegralFluxDifferencing, - dg::DGSEM, cache) - @threaded for element in eachelement(dg, cache) - flux_differencing_kernel!(du, u, element, mesh, - have_nonconservative_terms, equations, - volume_integral.volume_flux, dg, cache) - end - - return nothing -end - -======= ->>>>>>> main @inline function flux_differencing_kernel!(du, u, element, mesh::TreeMesh{3}, have_nonconservative_terms::False, equations, @@ -355,68 +337,6 @@ end return nothing end -<<<<<<< HEAD -# TODO: Taal dimension agnostic -function calc_volume_integral!(backend::Nothing, du, u, - mesh::Union{TreeMesh{3}, StructuredMesh{3}, P4estMesh{3}, - T8codeMesh{3}}, - have_nonconservative_terms, equations, - volume_integral::VolumeIntegralShockCapturingHG, - dg::DGSEM, cache) - @unpack volume_flux_dg, volume_flux_fv, indicator = volume_integral - - # Calculate blending factors α: u = u_DG * (1 - α) + u_FV * α - alpha = @trixi_timeit timer() "blending factors" indicator(u, mesh, equations, dg, - cache) - - # For `Float64`, this gives 1.8189894035458565e-12 - # For `Float32`, this gives 1.1920929f-5 - RealT = eltype(alpha) - atol = max(100 * eps(RealT), eps(RealT)^convert(RealT, 0.75f0)) - @threaded for element in eachelement(dg, cache) - alpha_element = alpha[element] - # Clip blending factor for values close to zero (-> pure DG) - dg_only = isapprox(alpha_element, 0, atol = atol) - - if dg_only - flux_differencing_kernel!(du, u, element, mesh, - have_nonconservative_terms, equations, - volume_flux_dg, dg, cache) - else - # Calculate DG volume integral contribution - flux_differencing_kernel!(du, u, element, mesh, - have_nonconservative_terms, equations, - volume_flux_dg, dg, cache, 1 - alpha_element) - - # 
Calculate FV volume integral contribution - fv_kernel!(du, u, mesh, have_nonconservative_terms, equations, - volume_flux_fv, dg, cache, element, alpha_element) - end - end - - return nothing -end - -# TODO: Taal dimension agnostic -function calc_volume_integral!(backend::Nothing, du, u, - mesh::Union{TreeMesh{3}, StructuredMesh{3}, P4estMesh{3}, - T8codeMesh{3}}, - have_nonconservative_terms, equations, - volume_integral::VolumeIntegralPureLGLFiniteVolume, - dg::DGSEM, cache) - @unpack volume_flux_fv = volume_integral - - # Calculate LGL FV volume integral - @threaded for element in eachelement(dg, cache) - fv_kernel!(du, u, mesh, have_nonconservative_terms, equations, volume_flux_fv, - dg, cache, element, true) - end - - return nothing -end - -======= ->>>>>>> main @inline function fv_kernel!(du, u, mesh::Union{TreeMesh{3}, StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, diff --git a/src/solvers/fdsbp_tree/fdsbp_1d.jl b/src/solvers/fdsbp_tree/fdsbp_1d.jl index 051e488d08c..6e71d7627d9 100644 --- a/src/solvers/fdsbp_tree/fdsbp_1d.jl +++ b/src/solvers/fdsbp_tree/fdsbp_1d.jl @@ -40,7 +40,7 @@ function create_cache(mesh::TreeMesh{1}, equations, end # 2D volume integral contributions for `VolumeIntegralStrongForm` -function calc_volume_integral!(du, u, +function calc_volume_integral!(backend::Nothing, du, u, mesh::TreeMesh{1}, have_nonconservative_terms::False, equations, volume_integral::VolumeIntegralStrongForm, @@ -87,7 +87,7 @@ end # the finite difference stencils. Thus, the D^- operator acts on the positive # part of the flux splitting f^+ and the D^+ operator acts on the negative part # of the flux splitting f^-. -function calc_volume_integral!(du, u, +function calc_volume_integral!(backend::Nothing, du, u, mesh::TreeMesh{1}, have_nonconservative_terms::False, equations, volume_integral::VolumeIntegralUpwind, diff --git a/src/solvers/fdsbp_tree/fdsbp_2d.jl b/src/solvers/fdsbp_tree/fdsbp_2d.jl index db3130e6ed3..6f642ef1ab6 100644 --- a/src/solvers/fdsbp_tree/fdsbp_2d.jl +++ b/src/solvers/fdsbp_tree/fdsbp_2d.jl @@ -40,7 +40,7 @@ function create_cache(mesh::Union{TreeMesh{2}, UnstructuredMesh2D}, equations, end # 2D volume integral contributions for `VolumeIntegralStrongForm` -function calc_volume_integral!(du, u, +function calc_volume_integral!(backend::Nothing, du, u, mesh::TreeMesh{2}, have_nonconservative_terms::False, equations, volume_integral::VolumeIntegralStrongForm, @@ -96,7 +96,7 @@ end # the finite difference stencils. Thus, the D^- operator acts on the positive # part of the flux splitting f^+ and the D^+ operator acts on the negative part # of the flux splitting f^-. -function calc_volume_integral!(du, u, +function calc_volume_integral!(backend::Nothing, du, u, mesh::TreeMesh{2}, have_nonconservative_terms::False, equations, volume_integral::VolumeIntegralUpwind, diff --git a/src/solvers/fdsbp_tree/fdsbp_3d.jl b/src/solvers/fdsbp_tree/fdsbp_3d.jl index b89dc3bee93..1eff0986e17 100644 --- a/src/solvers/fdsbp_tree/fdsbp_3d.jl +++ b/src/solvers/fdsbp_tree/fdsbp_3d.jl @@ -40,7 +40,7 @@ function create_cache(mesh::TreeMesh{3}, equations, end # 3D volume integral contributions for `VolumeIntegralStrongForm` -function calc_volume_integral!(backend, du, u, +function calc_volume_integral!(backend::Nothing, du, u, mesh::TreeMesh{3}, have_nonconservative_terms::False, equations, volume_integral::VolumeIntegralStrongForm, @@ -103,7 +103,7 @@ end # the finite difference stencils. 
Thus, the D^- operator acts on the positive # part of the flux splitting f^+ and the D^+ operator acts on the negative part # of the flux splitting f^-. -function calc_volume_integral!(backend, du, u, +function calc_volume_integral!(backend::Nothing, du, u, mesh::TreeMesh{3}, have_nonconservative_terms::False, equations, volume_integral::VolumeIntegralUpwind, diff --git a/src/solvers/fdsbp_unstructured/fdsbp_2d.jl b/src/solvers/fdsbp_unstructured/fdsbp_2d.jl index ac7e4c36758..5b3bd95b8cd 100644 --- a/src/solvers/fdsbp_unstructured/fdsbp_2d.jl +++ b/src/solvers/fdsbp_unstructured/fdsbp_2d.jl @@ -28,7 +28,7 @@ end # 2D volume integral contributions for `VolumeIntegralStrongForm` # OBS! This is the standard (not de-aliased) form of the volume integral. # So it is not provably stable for variable coefficients due to the the metric terms. -function calc_volume_integral!(du, u, +function calc_volume_integral!(backend::Nothing, du, u, mesh::UnstructuredMesh2D, have_nonconservative_terms::False, equations, volume_integral::VolumeIntegralStrongForm, @@ -91,7 +91,7 @@ end # the finite difference stencils. Thus, the D^- operator acts on the positive # part of the flux splitting f^+ and the D^+ operator acts on the negative part # of the flux splitting f^-. -function calc_volume_integral!(du, u, +function calc_volume_integral!(backend::Nothing, du, u, mesh::UnstructuredMesh2D, have_nonconservative_terms::False, equations, volume_integral::VolumeIntegralUpwind, From 307c3eba667b144223f268c2beaa1f9695681e94 Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Wed, 1 Oct 2025 16:32:35 +0200 Subject: [PATCH 63/81] more --- src/solvers/dgsem/calc_volume_integral.jl | 3 ++- src/solvers/dgsem_p4est/dg_3d.jl | 2 +- src/solvers/dgsem_structured/dg_3d.jl | 6 ++++-- src/solvers/dgsem_tree/dg_3d.jl | 3 ++- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/solvers/dgsem/calc_volume_integral.jl b/src/solvers/dgsem/calc_volume_integral.jl index 7900b967aa6..e0041305e88 100644 --- a/src/solvers/dgsem/calc_volume_integral.jl +++ b/src/solvers/dgsem/calc_volume_integral.jl @@ -70,7 +70,8 @@ function calc_volume_integral!(backend::Nothing, du, u, mesh, volume_flux_dg, dg, cache, 1 - alpha_element) # Calculate FV volume integral contribution - fv_kernel!(du, u, mesh, have_nonconservative_terms, equations, volume_flux_fv, + fv_kernel!(du, u, mesh, have_nonconservative_terms, equations, + volume_flux_fv, dg, cache, element, alpha_element) end end diff --git a/src/solvers/dgsem_p4est/dg_3d.jl b/src/solvers/dgsem_p4est/dg_3d.jl index 39a8a24de65..ea59ff6a1c6 100644 --- a/src/solvers/dgsem_p4est/dg_3d.jl +++ b/src/solvers/dgsem_p4est/dg_3d.jl @@ -358,7 +358,7 @@ end # Inlined function for interface flux computation for flux + nonconservative terms @inline function calc_interface_flux!(surface_flux_values, - mesh::Union{P4estMesh{3}, T8codeMesh{3}}, + ::Type{<:Union{P4estMesh{3}, T8codeMesh{3}}}, have_nonconservative_terms::True, equations, surface_integral, dg::DG, cache, interface_index, normal_direction, diff --git a/src/solvers/dgsem_structured/dg_3d.jl b/src/solvers/dgsem_structured/dg_3d.jl index ab555c481f8..b4421589520 100644 --- a/src/solvers/dgsem_structured/dg_3d.jl +++ b/src/solvers/dgsem_structured/dg_3d.jl @@ -39,7 +39,8 @@ function rhs!(backend, du, u, t, end # Apply Jacobian from mapping to reference element - @trixi_timeit timer() "Jacobian" apply_jacobian!(du, mesh, equations, dg, cache) + @trixi_timeit timer() "Jacobian" apply_jacobian!(backend, du, mesh, equations, dg, + cache) # Calculate 
source terms @trixi_timeit timer() "source terms" begin @@ -80,7 +81,8 @@ function calc_volume_integral!(backend::Backend, du, u, return nothing end -@kernel function weak_form_KAkernel!(du, u, meshT, have_nonconservative_terms, equations, +@kernel function weak_form_KAkernel!(du, u, meshT, have_nonconservative_terms, + equations, dg::DGSEM, contravariant_vectors) element = @index(Global) weak_form_kernel_element!(du, u, element, meshT, diff --git a/src/solvers/dgsem_tree/dg_3d.jl b/src/solvers/dgsem_tree/dg_3d.jl index 6ae047d519c..2a510982f6d 100644 --- a/src/solvers/dgsem_tree/dg_3d.jl +++ b/src/solvers/dgsem_tree/dg_3d.jl @@ -1318,7 +1318,8 @@ end return nothing end -function calc_surface_integral!(du, u, mesh::Union{TreeMesh{3}, StructuredMesh{3}}, +function calc_surface_integral!(backend::Nothing, du, u, + mesh::Union{TreeMesh{3}, StructuredMesh{3}}, equations, surface_integral, dg::DGSEM, cache) @unpack boundary_interpolation = dg.basis @unpack surface_flux_values = cache.elements From c39b4de1af51d8cd2f1436e18ece76ea082daaed Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Wed, 1 Oct 2025 22:18:29 +0200 Subject: [PATCH 64/81] more --- src/solvers/dgsem_p4est/dg_2d.jl | 2 +- src/solvers/dgsem_p4est/dg_3d.jl | 7 ++++--- src/solvers/dgsem_structured/dg_3d.jl | 2 +- src/solvers/dgsem_tree/dg_2d.jl | 4 ++-- src/solvers/dgsem_tree/dg_2d_parabolic.jl | 2 +- src/solvers/dgsem_tree/dg_3d.jl | 2 +- 6 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/solvers/dgsem_p4est/dg_2d.jl b/src/solvers/dgsem_p4est/dg_2d.jl index 2b2f9ff8b72..b417e87a77d 100644 --- a/src/solvers/dgsem_p4est/dg_2d.jl +++ b/src/solvers/dgsem_p4est/dg_2d.jl @@ -647,7 +647,7 @@ end return nothing end -function calc_surface_integral!(du, u, +function calc_surface_integral!(backend::Nothing, du, u, mesh::Union{P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}}, equations, diff --git a/src/solvers/dgsem_p4est/dg_3d.jl b/src/solvers/dgsem_p4est/dg_3d.jl index ea59ff6a1c6..6ab4f33e677 100644 --- a/src/solvers/dgsem_p4est/dg_3d.jl +++ b/src/solvers/dgsem_p4est/dg_3d.jl @@ -360,17 +360,18 @@ end @inline function calc_interface_flux!(surface_flux_values, ::Type{<:Union{P4estMesh{3}, T8codeMesh{3}}}, have_nonconservative_terms::True, equations, - surface_integral, dg::DG, cache, + surface_integral, solverT::Type{<:DG}, + u_interface, interface_index, normal_direction, primary_i_node_index, primary_j_node_index, primary_direction_index, primary_element_index, secondary_i_node_index, secondary_j_node_index, secondary_direction_index, secondary_element_index) - @unpack u = cache.interfaces surface_flux, nonconservative_flux = surface_integral.surface_flux - u_ll, u_rr = get_surface_node_vars(u, equations, dg, primary_i_node_index, + u_ll, u_rr = get_surface_node_vars(u_interface, equations, solverT, + primary_i_node_index, primary_j_node_index, interface_index) flux_ = surface_flux(u_ll, u_rr, normal_direction, equations) diff --git a/src/solvers/dgsem_structured/dg_3d.jl b/src/solvers/dgsem_structured/dg_3d.jl index b4421589520..64f03d30dca 100644 --- a/src/solvers/dgsem_structured/dg_3d.jl +++ b/src/solvers/dgsem_structured/dg_3d.jl @@ -34,7 +34,7 @@ function rhs!(backend, du, u, t, # Calculate surface integrals @trixi_timeit timer() "surface integral" begin - calc_surface_integral!(du, u, mesh, equations, + calc_surface_integral!(backend, du, u, mesh, equations, dg.surface_integral, dg, cache) end diff --git a/src/solvers/dgsem_tree/dg_2d.jl b/src/solvers/dgsem_tree/dg_2d.jl index 563b4d49e7e..57f7bf81ec6 100644 --- 
a/src/solvers/dgsem_tree/dg_2d.jl +++ b/src/solvers/dgsem_tree/dg_2d.jl @@ -156,7 +156,7 @@ function rhs!(backend, du, u, t, # Calculate surface integrals @trixi_timeit timer() "surface integral" begin - calc_surface_integral!(du, u, mesh, equations, + calc_surface_integral!(backend, du, u, mesh, equations, dg.surface_integral, dg, cache) end @@ -1021,7 +1021,7 @@ end return nothing end -function calc_surface_integral!(du, u, +function calc_surface_integral!(backend::Nothing, du, u, mesh::Union{TreeMesh{2}, StructuredMesh{2}, StructuredMeshView{2}}, equations, surface_integral::SurfaceIntegralWeakForm, diff --git a/src/solvers/dgsem_tree/dg_2d_parabolic.jl b/src/solvers/dgsem_tree/dg_2d_parabolic.jl index 232e13de88b..35f259ca9e5 100644 --- a/src/solvers/dgsem_tree/dg_2d_parabolic.jl +++ b/src/solvers/dgsem_tree/dg_2d_parabolic.jl @@ -103,7 +103,7 @@ function rhs_parabolic!(du, u, t, mesh::Union{TreeMesh{2}, TreeMesh{3}}, # Calculate surface integrals @trixi_timeit timer() "surface integral" begin - calc_surface_integral!(du, u, mesh, equations_parabolic, + calc_surface_integral!(nothing, du, u, mesh, equations_parabolic, dg.surface_integral, dg, cache_parabolic) end diff --git a/src/solvers/dgsem_tree/dg_3d.jl b/src/solvers/dgsem_tree/dg_3d.jl index 2a510982f6d..27a6158c637 100644 --- a/src/solvers/dgsem_tree/dg_3d.jl +++ b/src/solvers/dgsem_tree/dg_3d.jl @@ -1371,7 +1371,7 @@ function calc_surface_integral!(backend::Nothing, du, u, return nothing end -function apply_jacobian!(du, mesh::TreeMesh{3}, +function apply_jacobian!(backend::Nothing, du, mesh::TreeMesh{3}, equations, dg::DG, cache) @unpack inverse_jacobian = cache.elements From a38cc03f1a35c415d212f827694a0e8f68731ef7 Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Tue, 7 Oct 2025 18:07:27 +0200 Subject: [PATCH 65/81] Squashed commit of the following: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit f4bbcd9ffd18e933fe0b3888d8dbad1b92afd21e Author: Daniel Doehring Date: Sun Oct 5 08:33:38 2025 +0200 Comment `temperature` and /3 (#2594) --------- Co-authored-by: Hendrik Ranocha commit 68c0c71a20a8eace38dcd224277654eece7f57ca Author: Daniel Doehring Date: Fri Oct 3 15:06:55 2025 +0200 Second-Order Finite Volume Integral in 1D (#2022) * Pick up where Gregor left * preliminary example * more limiters * comments * fmt * continue * comments * print some more info * Add unit tests * add comment * Remove some alternative limiter implementations. 
* move, comments, fmt * Use second order timestepping * debug superbee * prim2cons 1D Adv * test * fmt, typo * typos * some more tests * fmt * Update src/solvers/dgsem_tree/finite_volume_O2.jl * Update test/test_unit.jl * Update src/solvers/dgsem_tree/dg_1d.jl * fmt * add different reconstruction mode * Update src/solvers/dgsem_tree/finite_volume_O2.jl Co-authored-by: Andrés Rueda-Ramírez * test + fmt * comments * correct way cells dim * increase coverage * revisit * continue * fmt * shorten * extra test * comment "inverse_weights" * change files * test vals * Update test/test_tree_1d_euler.jl * Update examples/structured_1d_dgsem/elixir_euler_source_terms_nonperiodic_fvO2.jl * Update examples/tree_1d_dgsem/elixir_euler_convergence_pure_fvO2.jl * Update test/test_tree_1d_euler.jl * fix * test compact print * comment * relabel * comments * comments * comments * comments * rm * test * rename * docstrings * comments * Apply suggestions from code review Co-authored-by: Joshua Lampert <51029046+JoshuaLampert@users.noreply.github.com> * fmt * fmt * mv * fix * Apply suggestions from code review Co-authored-by: Joshua Lampert <51029046+JoshuaLampert@users.noreply.github.com> --------- Co-authored-by: Andrés Rueda-Ramírez Co-authored-by: Joshua Lampert <51029046+JoshuaLampert@users.noreply.github.com> commit 96c7aef8e0c3086901d4fd6ce7594c0902f2bfda Author: Daniel Doehring Date: Thu Oct 2 18:19:13 2025 +0200 Bundle identical `rhs!` (#2552) * Bundle identical `rhs!` * fix 1d * comment * bring back --------- Co-authored-by: Hendrik Ranocha commit 5c978033d273b4a2e4cfc279fe31e2abfff90648 Author: Daniel Doehring Date: Thu Oct 2 15:39:55 2025 +0200 Use variable name `have_nonconservative_terms` (#2592) * Use variable name `have_nonconservative_terms` * fix * cons fmt --------- Co-authored-by: Benedict <135045760+benegee@users.noreply.github.com> Co-authored-by: Hendrik Ranocha commit 26886239f1194073a62cbc215846d62c658a8200 Author: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu Oct 2 08:10:10 2025 +0200 Bump crate-ci/typos from 1.35.7 to 1.37.1 (#2593) * Bump crate-ci/typos from 1.35.7 to 1.37.1 Bumps [crate-ci/typos](https://github.com/crate-ci/typos) from 1.35.7 to 1.37.1. - [Release notes](https://github.com/crate-ci/typos/releases) - [Changelog](https://github.com/crate-ci/typos/blob/master/CHANGELOG.md) - [Commits](https://github.com/crate-ci/typos/compare/v1.35.7...v1.37.1) --- updated-dependencies: - dependency-name: crate-ci/typos dependency-version: 1.37.1 dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] * fix typos --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Joshua Lampert --- .github/workflows/SpellCheck.yml | 2 +- NEWS.md | 6 +- .../elixir_navierstokes_couette_flow.jl | 5 +- .../elixir_navierstokes_poiseuille_flow.jl | 5 +- .../elixir_navierstokes_viscous_shock.jl | 12 +- ...avierstokes_viscous_shock_newton_krylov.jl | 12 +- .../elixir_navierstokes_viscous_shock.jl | 12 +- ...xir_euler_source_terms_nonperiodic_fvO2.jl | 63 +++++ .../elixir_euler_convergence_pure_fvO2.jl | 57 ++++ .../elixir_hypdiff_harmonic_nonperiodic.jl | 2 +- .../elixir_linearizedeuler_gauss_wall.jl | 2 +- .../elixir_navierstokes_convergence_walls.jl | 8 +- ...ixir_navierstokes_convergence_walls_amr.jl | 8 +- .../elixir_navierstokes_viscous_shock.jl | 12 +- .../elixir_navierstokes_viscous_shock_imex.jl | 12 +- ...erstokes_taylor_green_vortex_sutherland.jl | 2 +- src/Trixi.jl | 13 +- src/auxiliary/math.jl | 5 + .../subcell_limiter_idp_correction_2d.jl | 2 +- .../compressible_navier_stokes_1d.jl | 11 +- .../compressible_navier_stokes_2d.jl | 15 +- .../compressible_navier_stokes_3d.jl | 17 +- src/equations/hyperbolic_diffusion_1d.jl | 2 +- src/solvers/dg.jl | 87 +++++- src/solvers/dgmulti/flux_differencing.jl | 2 +- src/solvers/dgsem/calc_volume_integral.jl | 15 +- src/solvers/dgsem_p4est/dg_2d_parabolic.jl | 2 +- src/solvers/dgsem_structured/dg.jl | 44 ++++ src/solvers/dgsem_structured/dg_1d.jl | 43 +-- src/solvers/dgsem_structured/dg_2d.jl | 44 ---- src/solvers/dgsem_structured/dg_3d.jl | 45 ---- src/solvers/dgsem_tree/dg.jl | 4 + src/solvers/dgsem_tree/dg_1d.jl | 125 ++++++++- src/solvers/dgsem_tree/dg_2d.jl | 5 +- .../dgsem_tree/dg_2d_subcell_limiters.jl | 2 +- src/solvers/dgsem_tree/dg_3d.jl | 73 +----- .../dgsem_tree/subcell_finite_volume_O2.jl | 247 ++++++++++++++++++ src/solvers/dgsem_tree/subcell_limiters_2d.jl | 2 +- src/solvers/dgsem_unstructured/dg_2d.jl | 2 +- test/test_parabolic_2d.jl | 40 +-- test/test_structured_1d.jl | 21 ++ test/test_tree_1d_euler.jl | 38 +++ test/test_unit.jl | 52 ++++ 43 files changed, 862 insertions(+), 316 deletions(-) create mode 100644 examples/structured_1d_dgsem/elixir_euler_source_terms_nonperiodic_fvO2.jl create mode 100644 examples/tree_1d_dgsem/elixir_euler_convergence_pure_fvO2.jl create mode 100644 src/solvers/dgsem_tree/subcell_finite_volume_O2.jl diff --git a/.github/workflows/SpellCheck.yml b/.github/workflows/SpellCheck.yml index 172991d9f12..606c4b1add8 100644 --- a/.github/workflows/SpellCheck.yml +++ b/.github/workflows/SpellCheck.yml @@ -10,4 +10,4 @@ jobs: - name: Checkout Actions Repository uses: actions/checkout@v5 - name: Check spelling - uses: crate-ci/typos@v1.35.7 + uses: crate-ci/typos@v1.37.1 diff --git a/NEWS.md b/NEWS.md index b87a369b042..0290b08acd5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,12 +10,12 @@ for human readability. #### Changed -- The `polyester` preference got merged with the `native_threading` preference and the `Trixi.set_polyester!` +- The `polyester` preference got merged with the `native_threading` preference and the `Trixi.set_polyester!` function got renamed to `Trixi.set_threading_backend!` ([#2476]). - Default wave-speed estimate used within `flux_lax_friedrichs` changed from `max_abs_speed_naive` to `max_abs_speed` which is less diffusive. 
In v0.13, `flux_lax_friedrichs = FluxLaxFriedrichs(max_abs_speed = max_abs_speed)` - instead of the previous default + instead of the previous default `FluxLaxFriedrichs(max_abs_speed = max_abs_speed_naive)` ([#2458]). - The signature of the `VisualizationCallback` constructor changed. In the new version, it is mandatory to pass the semidiscretization `semi` to @@ -296,7 +296,7 @@ for human readability. `(; a, b) = stuff` instead of `@unpack a, b = stuff`. - The constructor `DGMultiMesh(dg; cells_per_dimension, kwargs...)` is deprecated and will be removed. The new constructor `DGMultiMesh(dg, cells_per_dimension; kwargs...)` - does not have `cells_per_dimesion` as a keyword argument. + does not have `cells_per_dimension` as a keyword argument. #### Removed diff --git a/examples/p4est_2d_dgsem/elixir_navierstokes_couette_flow.jl b/examples/p4est_2d_dgsem/elixir_navierstokes_couette_flow.jl index 84b56aad1c1..22e866a9bdd 100644 --- a/examples/p4est_2d_dgsem/elixir_navierstokes_couette_flow.jl +++ b/examples/p4est_2d_dgsem/elixir_navierstokes_couette_flow.jl @@ -67,9 +67,8 @@ bs_hyperbolic = Dict(:x_neg => BoundaryConditionDirichlet(initial_condition), # velocity_bc_top_left = NoSlip((x, t, equations) -> SVector(x[2] / height() * v_top(), 0)) # Use isothermal for inflow - adiabatic should also work heat_bc_top_left = Isothermal() do x, t, equations_parabolic - Trixi.temperature(initial_condition(x, t, - equations_parabolic), - equations_parabolic) + temperature(initial_condition(x, t, equations_parabolic), + equations_parabolic) end bc_parabolic_top_left = BoundaryConditionNavierStokesWall(velocity_bc_top_left, heat_bc_top_left) diff --git a/examples/p4est_2d_dgsem/elixir_navierstokes_poiseuille_flow.jl b/examples/p4est_2d_dgsem/elixir_navierstokes_poiseuille_flow.jl index fcbcd7d65e6..3ee1f85674a 100644 --- a/examples/p4est_2d_dgsem/elixir_navierstokes_poiseuille_flow.jl +++ b/examples/p4est_2d_dgsem/elixir_navierstokes_poiseuille_flow.jl @@ -69,9 +69,8 @@ bs_hyperbolic = Dict(:x_neg => BoundaryConditionDirichlet(initial_condition), # velocity_bc_inflow = NoSlip((x, t, equations) -> SVector(v_in, 0)) # Use isothermal for inflow - adiabatic should also work heat_bc_inflow = Isothermal() do x, t, equations_parabolic - Trixi.temperature(initial_condition(x, t, - equations_parabolic), - equations_parabolic) + temperature(initial_condition(x, t, equations_parabolic), + equations_parabolic) end bc_parabolic_inflow = BoundaryConditionNavierStokesWall(velocity_bc_inflow, heat_bc_inflow) diff --git a/examples/p4est_2d_dgsem/elixir_navierstokes_viscous_shock.jl b/examples/p4est_2d_dgsem/elixir_navierstokes_viscous_shock.jl index e0085091369..af1f04b7349 100644 --- a/examples/p4est_2d_dgsem/elixir_navierstokes_viscous_shock.jl +++ b/examples/p4est_2d_dgsem/elixir_navierstokes_viscous_shock.jl @@ -129,17 +129,13 @@ boundary_conditions = Dict(:x_neg => boundary_condition_inflow, ### Viscous boundary conditions ### # For the viscous BCs, we use the known analytical solution velocity_bc = NoSlip() do x, t, equations_parabolic - Trixi.velocity(initial_condition_viscous_shock(x, - t, - equations_parabolic), - equations_parabolic) + velocity(initial_condition_viscous_shock(x, t, equations_parabolic), + equations_parabolic) end heat_bc = Isothermal() do x, t, equations_parabolic - Trixi.temperature(initial_condition_viscous_shock(x, - t, - equations_parabolic), - equations_parabolic) + temperature(initial_condition_viscous_shock(x, t, equations_parabolic), + equations_parabolic) end 
boundary_condition_parabolic = BoundaryConditionNavierStokesWall(velocity_bc, heat_bc) diff --git a/examples/p4est_2d_dgsem/elixir_navierstokes_viscous_shock_newton_krylov.jl b/examples/p4est_2d_dgsem/elixir_navierstokes_viscous_shock_newton_krylov.jl index 142289aaace..5080de3ee56 100644 --- a/examples/p4est_2d_dgsem/elixir_navierstokes_viscous_shock_newton_krylov.jl +++ b/examples/p4est_2d_dgsem/elixir_navierstokes_viscous_shock_newton_krylov.jl @@ -124,17 +124,13 @@ boundary_conditions = Dict(:x_neg => boundary_condition_inflow, ### Viscous boundary conditions ### # For the viscous BCs, we use the known analytical solution velocity_bc = NoSlip() do x, t, equations_parabolic - Trixi.velocity(initial_condition_viscous_shock(x, - t, - equations_parabolic), - equations_parabolic) + velocity(initial_condition_viscous_shock(x, t, equations_parabolic), + equations_parabolic) end heat_bc = Isothermal() do x, t, equations_parabolic - Trixi.temperature(initial_condition_viscous_shock(x, - t, - equations_parabolic), - equations_parabolic) + temperature(initial_condition_viscous_shock(x, t, equations_parabolic), + equations_parabolic) end boundary_condition_parabolic = BoundaryConditionNavierStokesWall(velocity_bc, heat_bc) diff --git a/examples/p4est_3d_dgsem/elixir_navierstokes_viscous_shock.jl b/examples/p4est_3d_dgsem/elixir_navierstokes_viscous_shock.jl index 23abd9d1618..e048e4798e6 100644 --- a/examples/p4est_3d_dgsem/elixir_navierstokes_viscous_shock.jl +++ b/examples/p4est_3d_dgsem/elixir_navierstokes_viscous_shock.jl @@ -129,17 +129,13 @@ boundary_conditions = Dict(:x_neg => boundary_condition_inflow, ### Viscous boundary conditions ### # For the viscous BCs, we use the known analytical solution velocity_bc = NoSlip() do x, t, equations_parabolic - Trixi.velocity(initial_condition_viscous_shock(x, - t, - equations_parabolic), - equations_parabolic) + velocity(initial_condition_viscous_shock(x, t, equations_parabolic), + equations_parabolic) end heat_bc = Isothermal() do x, t, equations_parabolic - Trixi.temperature(initial_condition_viscous_shock(x, - t, - equations_parabolic), - equations_parabolic) + temperature(initial_condition_viscous_shock(x, t, equations_parabolic), + equations_parabolic) end boundary_condition_parabolic = BoundaryConditionNavierStokesWall(velocity_bc, heat_bc) diff --git a/examples/structured_1d_dgsem/elixir_euler_source_terms_nonperiodic_fvO2.jl b/examples/structured_1d_dgsem/elixir_euler_source_terms_nonperiodic_fvO2.jl new file mode 100644 index 00000000000..392a371f38c --- /dev/null +++ b/examples/structured_1d_dgsem/elixir_euler_source_terms_nonperiodic_fvO2.jl @@ -0,0 +1,63 @@ + +using OrdinaryDiffEqLowStorageRK +using Trixi + +############################################################################### +# semidiscretization of the compressible Euler equations + +equations = CompressibleEulerEquations1D(1.4) + +initial_condition = initial_condition_convergence_test +source_terms = source_terms_convergence_test + +# you can either use a single function to impose the BCs weakly in all +# 2*ndims == 2 directions or you can pass a tuple containing BCs for +# each direction +boundary_condition = BoundaryConditionDirichlet(initial_condition) +boundary_conditions = (x_neg = boundary_condition, + x_pos = boundary_condition) + +polydeg = 8 # Governs in this case only the number of subcells +basis = LobattoLegendreBasis(polydeg) +surface_flux = flux_hll +volume_integral = VolumeIntegralPureLGLFiniteVolumeO2(basis, surface_flux, + reconstruction_mode = 
reconstruction_O2_inner, + slope_limiter = vanLeer) +solver = DGSEM(polydeg = polydeg, surface_flux = surface_flux, + volume_integral = volume_integral) + +coordinates_min = (0.0,) +coordinates_max = (2.0,) +cells_per_dimension = (8,) +mesh = StructuredMesh(cells_per_dimension, coordinates_min, coordinates_max, + periodicity = false) + +semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver, + source_terms = source_terms, + boundary_conditions = boundary_conditions) + +############################################################################### +# ODE solvers, callbacks etc. + +tspan = (0.0, 2.0) +ode = semidiscretize(semi, tspan) + +summary_callback = SummaryCallback() + +analysis_interval = 100 +analysis_callback = AnalysisCallback(semi, interval = analysis_interval) + +alive_callback = AliveCallback(analysis_interval = analysis_interval) + +stepsize_callback = StepsizeCallback(cfl = 1.1) + +callbacks = CallbackSet(summary_callback, + analysis_callback, alive_callback, + stepsize_callback) + +############################################################################### +# run the simulation + +sol = solve(ode, ParsaniKetchesonDeconinck3S82(), + dt = 1.0, # solve needs some value here but it will be overwritten by the stepsize_callback + save_everystep = false, callback = callbacks); diff --git a/examples/tree_1d_dgsem/elixir_euler_convergence_pure_fvO2.jl b/examples/tree_1d_dgsem/elixir_euler_convergence_pure_fvO2.jl new file mode 100644 index 00000000000..0021569442f --- /dev/null +++ b/examples/tree_1d_dgsem/elixir_euler_convergence_pure_fvO2.jl @@ -0,0 +1,57 @@ + +using OrdinaryDiffEqLowStorageRK +using Trixi + +############################################################################### +# semidiscretization of the compressible Euler equations + +equations = CompressibleEulerEquations1D(1.4) + +initial_condition = initial_condition_convergence_test + +polydeg = 3 # Governs in this case only the number of subcells +basis = LobattoLegendreBasis(polydeg) +surface_flux = flux_hllc +volume_integral = VolumeIntegralPureLGLFiniteVolumeO2(basis, surface_flux, + reconstruction_mode = reconstruction_O2_full, + slope_limiter = monotonized_central) +solver = DGSEM(polydeg = polydeg, surface_flux = surface_flux, + volume_integral = volume_integral) + +coordinates_min = 0.0 +coordinates_max = 2.0 +mesh = TreeMesh(coordinates_min, coordinates_max, + initial_refinement_level = 4, + n_cells_max = 10_000) + +semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver, + source_terms = source_terms_convergence_test) + +############################################################################### +# ODE solvers, callbacks etc. 
+ +tspan = (0.0, 2.0) +ode = semidiscretize(semi, tspan) + +summary_callback = SummaryCallback() + +analysis_interval = 100 +analysis_callback = AnalysisCallback(semi, interval = analysis_interval, + extra_analysis_errors = (:l2_error_primitive, + :linf_error_primitive, + :conservation_error)) + +alive_callback = AliveCallback(analysis_interval = analysis_interval) + +stepsize_callback = StepsizeCallback(cfl = 1.1) + +callbacks = CallbackSet(summary_callback, + analysis_callback, alive_callback, + stepsize_callback) + +############################################################################### +# run the simulation + +sol = solve(ode, ORK256(), + dt = 1.0, # solve needs some value here but it will be overwritten by the stepsize_callback + save_everystep = false, callback = callbacks); diff --git a/examples/tree_1d_dgsem/elixir_hypdiff_harmonic_nonperiodic.jl b/examples/tree_1d_dgsem/elixir_hypdiff_harmonic_nonperiodic.jl index 52653c0f923..ae6a9e28b80 100644 --- a/examples/tree_1d_dgsem/elixir_hypdiff_harmonic_nonperiodic.jl +++ b/examples/tree_1d_dgsem/elixir_hypdiff_harmonic_nonperiodic.jl @@ -8,7 +8,7 @@ equations = HyperbolicDiffusionEquations1D(nu = 1.25) """ initial_condition_poisson_nonperiodic(x, t, equations::HyperbolicDiffusionEquations1D) -A non-priodic harmonic function used in combination with +A non-periodic harmonic function used in combination with [`source_terms_poisson_nonperiodic`](@ref) and [`boundary_condition_poisson_nonperiodic`](@ref). !!! note diff --git a/examples/tree_1d_dgsem/elixir_linearizedeuler_gauss_wall.jl b/examples/tree_1d_dgsem/elixir_linearizedeuler_gauss_wall.jl index 4880c6ae623..a7844b5ce0a 100644 --- a/examples/tree_1d_dgsem/elixir_linearizedeuler_gauss_wall.jl +++ b/examples/tree_1d_dgsem/elixir_linearizedeuler_gauss_wall.jl @@ -19,7 +19,7 @@ mesh = TreeMesh(coordinates_min, coordinates_max, # Initialize density and pressure perturbation with a Gaussian bump # that is advected to left with v - c and to the right with v + c. -# Correspondigly, the bump splits in half. +# Correspondingly, the bump splits in half. 
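+# (As a reminder, the 1D linearized Euler equations transport perturbations
+# along the characteristics with speeds v - c, v, and v + c.)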
function initial_condition_gauss_wall(x, t, equations::LinearizedEulerEquations1D) v1_prime = 0 rho_prime = p_prime = 2 * exp(-(x[1] - 45)^2 / 25) diff --git a/examples/tree_1d_dgsem/elixir_navierstokes_convergence_walls.jl b/examples/tree_1d_dgsem/elixir_navierstokes_convergence_walls.jl index 2b9979db443..2f7e078d3fb 100644 --- a/examples/tree_1d_dgsem/elixir_navierstokes_convergence_walls.jl +++ b/examples/tree_1d_dgsem/elixir_navierstokes_convergence_walls.jl @@ -135,10 +135,10 @@ velocity_bc_left_right = NoSlip() do x, t, equations_parabolic end heat_bc_left = Isothermal() do x, t, equations_parabolic - Trixi.temperature(initial_condition_navier_stokes_convergence_test(x, - t, - equations_parabolic), - equations_parabolic) + temperature(initial_condition_navier_stokes_convergence_test(x, + t, + equations_parabolic), + equations_parabolic) end heat_bc_right = Adiabatic((x, t, equations_parabolic) -> 0.0) diff --git a/examples/tree_1d_dgsem/elixir_navierstokes_convergence_walls_amr.jl b/examples/tree_1d_dgsem/elixir_navierstokes_convergence_walls_amr.jl index cb7b4310b6e..d06f0b85e07 100644 --- a/examples/tree_1d_dgsem/elixir_navierstokes_convergence_walls_amr.jl +++ b/examples/tree_1d_dgsem/elixir_navierstokes_convergence_walls_amr.jl @@ -135,10 +135,10 @@ velocity_bc_left_right = NoSlip() do x, t, equations_parabolic end heat_bc_left = Isothermal() do x, t, equations_parabolic - Trixi.temperature(initial_condition_navier_stokes_convergence_test(x, - t, - equations_parabolic), - equations_parabolic) + temperature(initial_condition_navier_stokes_convergence_test(x, + t, + equations_parabolic), + equations_parabolic) end heat_bc_right = Adiabatic((x, t, equations_parabolic) -> 0.0) diff --git a/examples/tree_1d_dgsem/elixir_navierstokes_viscous_shock.jl b/examples/tree_1d_dgsem/elixir_navierstokes_viscous_shock.jl index 80597cab362..ad2e7ef7040 100644 --- a/examples/tree_1d_dgsem/elixir_navierstokes_viscous_shock.jl +++ b/examples/tree_1d_dgsem/elixir_navierstokes_viscous_shock.jl @@ -123,17 +123,13 @@ boundary_conditions = (; x_neg = boundary_condition_inflow, ### Viscous boundary conditions ### # For the viscous BCs, we use the known analytical solution velocity_bc = NoSlip() do x, t, equations_parabolic - Trixi.velocity(initial_condition_viscous_shock(x, - t, - equations_parabolic), - equations_parabolic) + velocity(initial_condition_viscous_shock(x, t, equations_parabolic), + equations_parabolic) end heat_bc = Isothermal() do x, t, equations_parabolic - Trixi.temperature(initial_condition_viscous_shock(x, - t, - equations_parabolic), - equations_parabolic) + temperature(initial_condition_viscous_shock(x, t, equations_parabolic), + equations_parabolic) end boundary_condition_parabolic = BoundaryConditionNavierStokesWall(velocity_bc, heat_bc) diff --git a/examples/tree_1d_dgsem/elixir_navierstokes_viscous_shock_imex.jl b/examples/tree_1d_dgsem/elixir_navierstokes_viscous_shock_imex.jl index 18f1df5bd28..fe29e9feb9e 100644 --- a/examples/tree_1d_dgsem/elixir_navierstokes_viscous_shock_imex.jl +++ b/examples/tree_1d_dgsem/elixir_navierstokes_viscous_shock_imex.jl @@ -117,17 +117,13 @@ boundary_conditions = (; x_neg = boundary_condition_inflow, ### Viscous boundary conditions ### # For the viscous BCs, we use the known analytical solution velocity_bc = NoSlip() do x, t, equations_parabolic - Trixi.velocity(initial_condition_viscous_shock(x, - t, - equations_parabolic), - equations_parabolic) + velocity(initial_condition_viscous_shock(x, t, equations_parabolic), + equations_parabolic) end 
heat_bc = Isothermal() do x, t, equations_parabolic - Trixi.temperature(initial_condition_viscous_shock(x, - t, - equations_parabolic), - equations_parabolic) + temperature(initial_condition_viscous_shock(x, t, equations_parabolic), + equations_parabolic) end boundary_condition_parabolic = BoundaryConditionNavierStokesWall(velocity_bc, heat_bc) diff --git a/examples/tree_2d_dgsem/elixir_navierstokes_taylor_green_vortex_sutherland.jl b/examples/tree_2d_dgsem/elixir_navierstokes_taylor_green_vortex_sutherland.jl index 3beade2b09a..df16dca0302 100644 --- a/examples/tree_2d_dgsem/elixir_navierstokes_taylor_green_vortex_sutherland.jl +++ b/examples/tree_2d_dgsem/elixir_navierstokes_taylor_green_vortex_sutherland.jl @@ -16,7 +16,7 @@ prandtl_number() = 0.72 T_ref = convert(RealT, 291.15) R_specific_air = convert(RealT, 287.052874) - T = R_specific_air * Trixi.temperature(u, equations) + T = R_specific_air * temperature(u, equations) C_air = 120 mu_ref_air = convert(RealT, 1.827e-5) diff --git a/src/Trixi.jl b/src/Trixi.jl index 289e48c572e..8192520696d 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -243,8 +243,10 @@ export initial_condition_eoc_test_coupled_euler_gravity, export cons2cons, cons2prim, prim2cons, cons2macroscopic, cons2state, cons2mean, cons2entropy, entropy2cons -export density, pressure, density_pressure, velocity, global_mean_vars, - equilibrium_distribution, waterheight, waterheight_pressure +export density, pressure, density_pressure, velocity, temperature, + global_mean_vars, + equilibrium_distribution, + waterheight, waterheight_pressure export entropy, energy_total, energy_kinetic, energy_internal, energy_magnetic, cross_helicity, magnetic_field, divergence_cleaning_field, enstrophy, vorticity @@ -259,13 +261,18 @@ export DG, FDSBP, VolumeIntegralWeakForm, VolumeIntegralStrongForm, VolumeIntegralFluxDifferencing, - VolumeIntegralPureLGLFiniteVolume, + VolumeIntegralPureLGLFiniteVolume, VolumeIntegralPureLGLFiniteVolumeO2, VolumeIntegralShockCapturingHG, IndicatorHennemannGassner, VolumeIntegralUpwind, SurfaceIntegralWeakForm, SurfaceIntegralStrongForm, SurfaceIntegralUpwind, MortarL2 +export reconstruction_O2_inner, reconstruction_O2_full, + reconstruction_constant, + minmod, monotonized_central, superbee, vanLeer, + central_slope + export VolumeIntegralSubcellLimiting, BoundsCheckCallback, SubcellLimiterIDP, SubcellLimiterIDPCorrection diff --git a/src/auxiliary/math.jl b/src/auxiliary/math.jl index e2fcab85fa0..2ef360c6e96 100644 --- a/src/auxiliary/math.jl +++ b/src/auxiliary/math.jl @@ -434,4 +434,9 @@ Given ε = 1.0e-4, we use the following algorithm. (y^(gamma - 1) - x^(gamma - 1)) end end + +# Note: This is not a limiter, instead a helper for the `superbee` limiter. 
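+# In contrast to `minmod`, `maxmod` selects the argument with the larger
+# magnitude when `sl` and `sr` share the same sign and returns zero otherwise;
+# e.g., the classical superbee limiter can be written as
+# maxmod(minmod(sl, 2 * sr), minmod(2 * sl, sr)).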
+@inline function maxmod(sl, sr) + return 0.5f0 * (sign(sl) + sign(sr)) * max(abs(sl), abs(sr)) +end end # @muladd diff --git a/src/callbacks_stage/subcell_limiter_idp_correction_2d.jl b/src/callbacks_stage/subcell_limiter_idp_correction_2d.jl index 337b62a8fb1..4caaff8fc17 100644 --- a/src/callbacks_stage/subcell_limiter_idp_correction_2d.jl +++ b/src/callbacks_stage/subcell_limiter_idp_correction_2d.jl @@ -9,7 +9,7 @@ function perform_idp_correction!(u, dt, mesh::Union{TreeMesh{2}, StructuredMesh{2}, P4estMesh{2}}, equations, dg, cache) - @unpack inverse_weights = dg.basis + @unpack inverse_weights = dg.basis # Plays role of inverse DG-subcell sizes @unpack antidiffusive_flux1_L, antidiffusive_flux2_L, antidiffusive_flux1_R, antidiffusive_flux2_R = cache.antidiffusive_fluxes @unpack alpha = dg.volume_integral.limiter.cache.subcell_limiter_coefficients diff --git a/src/equations/compressible_navier_stokes_1d.jl b/src/equations/compressible_navier_stokes_1d.jl index 8d66b0d077f..07ca7df987b 100644 --- a/src/equations/compressible_navier_stokes_1d.jl +++ b/src/equations/compressible_navier_stokes_1d.jl @@ -280,11 +280,20 @@ end prim2cons(u, equations.equations_hyperbolic) end +""" + temperature(u, equations::CompressibleNavierStokesDiffusion1D) + +Compute the temperature from the conservative variables `u`. +In particular, this assumes a specific gas constant ``R = 1``: +```math +T = \\frac{p}{\\rho} +``` +""" @inline function temperature(u, equations::CompressibleNavierStokesDiffusion1D) rho, rho_v1, rho_e = u p = (equations.gamma - 1) * (rho_e - 0.5f0 * rho_v1^2 / rho) - T = p / rho + T = p / rho # Corresponds to a specific gas constant R = 1 return T end diff --git a/src/equations/compressible_navier_stokes_2d.jl b/src/equations/compressible_navier_stokes_2d.jl index c3ad64143fd..96f00c866e7 100644 --- a/src/equations/compressible_navier_stokes_2d.jl +++ b/src/equations/compressible_navier_stokes_2d.jl @@ -159,12 +159,12 @@ function flux(u, gradients, orientation::Integer, # Components of viscous stress tensor # (4 * (v1)_x / 3 - 2 * (v2)_y / 3) - tau_11 = 4 * dv1dx / 3 - 2 * dv2dy / 3 + tau_11 = (4 * dv1dx - 2 * dv2dy) / 3 # ((v1)_y + (v2)_x) # stress tensor is symmetric tau_12 = dv1dy + dv2dx # = tau_21 # (4/3 * (v2)_y - 2/3 * (v1)_x) - tau_22 = 4 * dv2dy / 3 - 2 * dv1dx / 3 + tau_22 = (4 * dv2dy - 2 * dv1dx) / 3 # Fick's law q = -kappa * grad(T) = -kappa * grad(p / (R rho)) # with thermal diffusivity constant kappa = gamma μ R / ((gamma-1) Pr) @@ -274,11 +274,20 @@ end prim2cons(u, equations.equations_hyperbolic) end +""" + temperature(u, equations::CompressibleNavierStokesDiffusion2D) + +Compute the temperature from the conservative variables `u`. 
+In particular, this assumes a specific gas constant ``R = 1``: +```math +T = \\frac{p}{\\rho} +``` +""" @inline function temperature(u, equations::CompressibleNavierStokesDiffusion2D) rho, rho_v1, rho_v2, rho_e = u p = (equations.gamma - 1) * (rho_e - 0.5f0 * (rho_v1^2 + rho_v2^2) / rho) - T = p / rho + T = p / rho # Corresponds to a specific gas constant R = 1 return T end diff --git a/src/equations/compressible_navier_stokes_3d.jl b/src/equations/compressible_navier_stokes_3d.jl index fa6075b5a2f..6c615a11ced 100644 --- a/src/equations/compressible_navier_stokes_3d.jl +++ b/src/equations/compressible_navier_stokes_3d.jl @@ -164,11 +164,11 @@ function flux(u, gradients, orientation::Integer, # Diagonal parts # (4 * (v1)_x / 3 - 2 * ((v2)_y + (v3)_z)) / 3) - tau_11 = 4 * dv1dx / 3 - 2 * (dv2dy + dv3dz) / 3 + tau_11 = (4 * dv1dx - 2 * (dv2dy + dv3dz)) / 3 # (4 * (v2)_y / 3 - 2 * ((v1)_x + (v3)_z) / 3) - tau_22 = 4 * dv2dy / 3 - 2 * (dv1dx + dv3dz) / 3 + tau_22 = (4 * dv2dy - 2 * (dv1dx + dv3dz)) / 3 # (4 * (v3)_z / 3 - 2 * ((v1)_x + (v2)_y) / 3) - tau_33 = 4 * dv3dz / 3 - 2 * (dv1dx + dv2dy) / 3 + tau_33 = (4 * dv3dz - 2 * (dv1dx + dv2dy)) / 3 # Off diagonal parts, exploit that stress tensor is symmetric # ((v1)_y + (v2)_x) @@ -302,11 +302,20 @@ end prim2cons(u, equations.equations_hyperbolic) end +""" + temperature(u, equations::CompressibleNavierStokesDiffusion3D) + +Compute the temperature from the conservative variables `u`. +In particular, this assumes a specific gas constant ``R = 1``: +```math +T = \\frac{p}{\\rho} +``` +""" @inline function temperature(u, equations::CompressibleNavierStokesDiffusion3D) rho, rho_v1, rho_v2, rho_v3, rho_e = u p = (equations.gamma - 1) * (rho_e - 0.5f0 * (rho_v1^2 + rho_v2^2 + rho_v3^2) / rho) - T = p / rho + T = p / rho # Corresponds to a specific gas constant R = 1 return T end diff --git a/src/equations/hyperbolic_diffusion_1d.jl b/src/equations/hyperbolic_diffusion_1d.jl index 804a3e0b499..48601dfd675 100644 --- a/src/equations/hyperbolic_diffusion_1d.jl +++ b/src/equations/hyperbolic_diffusion_1d.jl @@ -44,7 +44,7 @@ end """ initial_condition_poisson_nonperiodic(x, t, equations::HyperbolicDiffusionEquations1D) -A non-priodic smooth initial condition. Can be used for convergence tests in combination with +A non-periodic smooth initial condition. Can be used for convergence tests in combination with [`source_terms_poisson_nonperiodic`](@ref) and [`boundary_condition_poisson_nonperiodic`](@ref). !!! note The solution is periodic but the initial guess is not. diff --git a/src/solvers/dg.jl b/src/solvers/dg.jl index f402aad2ebd..b08d2d3de15 100644 --- a/src/solvers/dg.jl +++ b/src/solvers/dg.jl @@ -185,6 +185,11 @@ function get_element_variables!(element_variables, u, mesh, equations, volume_integral) end +# Abstract supertype for first-order `VolumeIntegralPureLGLFiniteVolume` and +# second-order `VolumeIntegralPureLGLFiniteVolumeO2` subcell-based finite volume +# volume integrals. +abstract type AbstractVolumeIntegralPureLGLFiniteVolume <: AbstractVolumeIntegral end + """ VolumeIntegralPureLGLFiniteVolume(volume_flux_fv) @@ -203,7 +208,8 @@ mesh (LGL = Legendre-Gauss-Lobatto). 
"A provably entropy stable subcell shock capturing approach for high order split form DG" [arXiv: 2008.12044](https://arxiv.org/abs/2008.12044) """ -struct VolumeIntegralPureLGLFiniteVolume{VolumeFluxFV} <: AbstractVolumeIntegral +struct VolumeIntegralPureLGLFiniteVolume{VolumeFluxFV} <: + AbstractVolumeIntegralPureLGLFiniteVolume volume_flux_fv::VolumeFluxFV # non-symmetric in general, e.g. entropy-dissipative end # TODO: Figure out if this can also be used for Gauss nodes, not just LGL, and adjust the name accordingly @@ -222,6 +228,85 @@ function Base.show(io::IO, ::MIME"text/plain", end end +""" + VolumeIntegralPureLGLFiniteVolumeO2(basis::Basis, volume_flux_fv; + reconstruction_mode = reconstruction_O2_full, + slope_limiter = minmod) + +This gives an up to second order accurate finite volume scheme on an LGL-type subcell +mesh (LGL = Legendre-Gauss-Lobatto). +Depending on the `reconstruction_mode` and `slope_limiter`, experimental orders of convergence +between 1 and 2 can be expected in practice. +Since this is a volume integral, all reconstructions are purely cell-local, i.e., +no neighboring elements are queried at reconstruction stage. + +The interface values of the inner DG-subcells are reconstructed using the standard MUSCL-type reconstruction. +For the DG-subcells at the boundaries, two options are available: + +1) The unlimited slope is used on these cells. + This gives full second order accuracy, but also does not damp overshoots between cells. + The `reconstruction_mode` corresponding to this is `reconstruction_O2_full`. +2) On boundary subcells, the solution is represented using a constant value, thereby falling back to formally only first order. + The `reconstruction_mode` corresponding to this is `reconstruction_O2_inner`. + In the reference below, this is the recommended reconstruction mode and is thus used by default. + +!!! note "Conservative Systems only" + Currently only implemented for systems in conservative form, i.e., + `have_nonconservative_terms(equations) = False()` + +!!! warning "Experimental implementation" + This is an experimental feature and may change in future releases. + +## References + +See especially Sections 3.2, Section 4, and Appendix D of the paper + +- Rueda-Ramírez, Hennemann, Hindenlang, Winters, & Gassner (2021). + "An entropy stable nodal discontinuous Galerkin method for the resistive MHD equations. + Part II: Subcell finite volume shock capturing" + [JCP: 2021.110580](https://doi.org/10.1016/j.jcp.2021.110580) +""" +struct VolumeIntegralPureLGLFiniteVolumeO2{RealT <: Real, Basis, VolumeFluxFV, + Reconstruction, Limiter} <: + AbstractVolumeIntegralPureLGLFiniteVolume + x_interfaces::Vector{RealT} # x-coordinates of the sub-cell element interfaces + volume_flux_fv::VolumeFluxFV # non-symmetric in general, e.g. 
entropy-dissipative + reconstruction_mode::Reconstruction # which type of FV reconstruction to use + slope_limiter::Limiter # which type of slope limiter function +end + +function VolumeIntegralPureLGLFiniteVolumeO2(basis::Basis, volume_flux_fv; + reconstruction_mode = reconstruction_O2_full, + slope_limiter = minmod) where {Basis} + # Suffices to store only the intermediate boundaries of the sub-cell elements + x_interfaces = cumsum(basis.weights)[1:(end - 1)] .- 1 + VolumeIntegralPureLGLFiniteVolumeO2{eltype(basis.weights), + typeof(basis), + typeof(volume_flux_fv), + typeof(reconstruction_mode), + typeof(slope_limiter)}(x_interfaces, + volume_flux_fv, + reconstruction_mode, + slope_limiter) +end + +function Base.show(io::IO, ::MIME"text/plain", + integral::VolumeIntegralPureLGLFiniteVolumeO2) + @nospecialize integral # reduce precompilation time + + if get(io, :compact, false) + show(io, integral) + else + setup = [ + "FV flux" => integral.volume_flux_fv, + "Reconstruction" => integral.reconstruction_mode, + "Slope limiter" => integral.slope_limiter, + "Subcell boundaries" => vcat([-1.0], integral.x_interfaces, [1.0]) + ] + summary_box(io, "VolumeIntegralPureLGLFiniteVolumeO2", setup) + end +end + """ VolumeIntegralSubcellLimiting(limiter; volume_flux_dg, volume_flux_fv) diff --git a/src/solvers/dgmulti/flux_differencing.jl b/src/solvers/dgmulti/flux_differencing.jl index 47750ffd5a0..458e06e88b6 100644 --- a/src/solvers/dgmulti/flux_differencing.jl +++ b/src/solvers/dgmulti/flux_differencing.jl @@ -234,7 +234,7 @@ end end # Return the contravariant basis vector corresponding to the Cartesian -# coordinate diretion `orientation` in a given `element` of the `mesh`. +# coordinate direction `orientation` in a given `element` of the `mesh`. # The contravariant basis vectors have entries `dx_i / dxhat_j` where # j ∈ {1, ..., NDIMS}. Here, `x_i` and `xhat_j` are the ith physical coordinate # and jth reference coordinate, respectively. 
These are geometric terms which diff --git a/src/solvers/dgsem/calc_volume_integral.jl b/src/solvers/dgsem/calc_volume_integral.jl index e0041305e88..84c914c340f 100644 --- a/src/solvers/dgsem/calc_volume_integral.jl +++ b/src/solvers/dgsem/calc_volume_integral.jl @@ -32,8 +32,8 @@ function calc_volume_integral!(backend::Nothing, du, u, mesh, volume_integral::VolumeIntegralFluxDifferencing, dg::DGSEM, cache) @threaded for element in eachelement(dg, cache) - flux_differencing_kernel!(du, u, element, mesh, have_nonconservative_terms, - equations, + flux_differencing_kernel!(du, u, element, mesh, + have_nonconservative_terms, equations, volume_integral.volume_flux, dg, cache) end @@ -70,9 +70,9 @@ function calc_volume_integral!(backend::Nothing, du, u, mesh, volume_flux_dg, dg, cache, 1 - alpha_element) # Calculate FV volume integral contribution - fv_kernel!(du, u, mesh, have_nonconservative_terms, equations, - volume_flux_fv, - dg, cache, element, alpha_element) + fv_kernel!(du, u, mesh, + have_nonconservative_terms, equations, + volume_flux_fv, dg, cache, element, alpha_element) end end @@ -87,8 +87,9 @@ function calc_volume_integral!(backend::Nothing, du, u, mesh, # Calculate LGL FV volume integral @threaded for element in eachelement(dg, cache) - fv_kernel!(du, u, mesh, have_nonconservative_terms, equations, volume_flux_fv, - dg, cache, element, true) + fv_kernel!(du, u, mesh, + have_nonconservative_terms, equations, + volume_flux_fv, dg, cache, element, true) end return nothing diff --git a/src/solvers/dgsem_p4est/dg_2d_parabolic.jl b/src/solvers/dgsem_p4est/dg_2d_parabolic.jl index 8d56fdf7515..7d263b5fa2e 100644 --- a/src/solvers/dgsem_p4est/dg_2d_parabolic.jl +++ b/src/solvers/dgsem_p4est/dg_2d_parabolic.jl @@ -246,7 +246,7 @@ function calc_gradient!(gradients, u_transformed, t, dg) end - # Prolong solution to mortars. This resues the hyperbolic version of `prolong2mortars` + # Prolong solution to mortars. 
This reuses the hyperbolic version of `prolong2mortars` @trixi_timeit timer() "prolong2mortars" begin prolong2mortars!(cache, u_transformed, mesh, equations_parabolic, dg.mortar, dg) diff --git a/src/solvers/dgsem_structured/dg.jl b/src/solvers/dgsem_structured/dg.jl index 557b5c3364f..6cc2791c27e 100644 --- a/src/solvers/dgsem_structured/dg.jl +++ b/src/solvers/dgsem_structured/dg.jl @@ -35,6 +35,50 @@ function calc_boundary_flux!(cache, u, t, boundary_condition::BoundaryConditionP @assert isperiodic(mesh) end +function rhs!(du, u, t, + mesh::Union{StructuredMesh, StructuredMeshView{2}}, equations, + boundary_conditions, source_terms::Source, + dg::DG, cache) where {Source} + # Reset du + @trixi_timeit timer() "reset ∂u/∂t" reset_du!(du, dg, cache) + + # Calculate volume integral + @trixi_timeit timer() "volume integral" begin + calc_volume_integral!(du, u, mesh, + have_nonconservative_terms(equations), equations, + dg.volume_integral, dg, cache) + end + + # Calculate interface and boundary fluxes + @trixi_timeit timer() "interface flux" begin + calc_interface_flux!(cache, u, mesh, + have_nonconservative_terms(equations), equations, + dg.surface_integral, dg) + end + + # Calculate boundary fluxes + @trixi_timeit timer() "boundary flux" begin + calc_boundary_flux!(cache, u, t, boundary_conditions, mesh, equations, + dg.surface_integral, dg) + end + + # Calculate surface integrals + @trixi_timeit timer() "surface integral" begin + calc_surface_integral!(du, u, mesh, equations, + dg.surface_integral, dg, cache) + end + + # Apply Jacobian from mapping to reference element + @trixi_timeit timer() "Jacobian" apply_jacobian!(du, mesh, equations, dg, cache) + + # Calculate source terms + @trixi_timeit timer() "source terms" begin + calc_sources!(du, u, t, source_terms, equations, dg, cache) + end + + return nothing +end + @inline function calc_boundary_flux_by_direction!(surface_flux_values, u, t, orientation, boundary_condition::BoundaryConditionPeriodic, diff --git a/src/solvers/dgsem_structured/dg_1d.jl b/src/solvers/dgsem_structured/dg_1d.jl index 0a9618c6d9a..8417c709338 100644 --- a/src/solvers/dgsem_structured/dg_1d.jl +++ b/src/solvers/dgsem_structured/dg_1d.jl @@ -5,49 +5,8 @@ @muladd begin #! 
format: noindent -function rhs!(backend, du, u, t, - mesh::StructuredMesh{1}, equations, - boundary_conditions, source_terms::Source, - dg::DG, cache) where {Source} - # Reset du - @trixi_timeit timer() "reset ∂u/∂t" reset_du!(du, dg, cache) - - # Calculate volume integral - @trixi_timeit timer() "volume integral" begin - calc_volume_integral!(backend, du, u, mesh, - have_nonconservative_terms(equations), equations, - dg.volume_integral, dg, cache) - end - - # Calculate interface and boundary fluxes - @trixi_timeit timer() "interface flux" begin - calc_interface_flux!(cache, u, mesh, equations, dg.surface_integral, dg) - end - - # Calculate boundary fluxes - @trixi_timeit timer() "boundary flux" begin - calc_boundary_flux!(cache, u, t, boundary_conditions, mesh, equations, - dg.surface_integral, dg) - end - - # Calculate surface integrals - @trixi_timeit timer() "surface integral" begin - calc_surface_integral!(du, u, mesh, equations, - dg.surface_integral, dg, cache) - end - - # Apply Jacobian from mapping to reference element - @trixi_timeit timer() "Jacobian" apply_jacobian!(du, mesh, equations, dg, cache) - - # Calculate source terms - @trixi_timeit timer() "source terms" begin - calc_sources!(du, u, t, source_terms, equations, dg, cache) - end - - return nothing -end - function calc_interface_flux!(cache, u, mesh::StructuredMesh{1}, + nonconservative_terms, # can be True/False equations, surface_integral, dg::DG) @unpack surface_flux = surface_integral diff --git a/src/solvers/dgsem_structured/dg_2d.jl b/src/solvers/dgsem_structured/dg_2d.jl index b74ab435228..6430b61b276 100644 --- a/src/solvers/dgsem_structured/dg_2d.jl +++ b/src/solvers/dgsem_structured/dg_2d.jl @@ -5,50 +5,6 @@ @muladd begin #! format: noindent -function rhs!(backend, du, u, t, - mesh::Union{StructuredMesh{2}, StructuredMeshView{2}}, equations, - boundary_conditions, source_terms::Source, - dg::DG, cache) where {Source} - # Reset du - @trixi_timeit timer() "reset ∂u/∂t" reset_du!(du, dg, cache) - - # Calculate volume integral - @trixi_timeit timer() "volume integral" begin - calc_volume_integral!(backend, du, u, mesh, - have_nonconservative_terms(equations), equations, - dg.volume_integral, dg, cache) - end - - # Calculate interface fluxes - @trixi_timeit timer() "interface flux" begin - calc_interface_flux!(cache, u, mesh, - have_nonconservative_terms(equations), equations, - dg.surface_integral, dg) - end - - # Calculate boundary fluxes - @trixi_timeit timer() "boundary flux" begin - calc_boundary_flux!(cache, u, t, boundary_conditions, mesh, equations, - dg.surface_integral, dg) - end - - # Calculate surface integrals - @trixi_timeit timer() "surface integral" begin - calc_surface_integral!(du, u, mesh, equations, - dg.surface_integral, dg, cache) - end - - # Apply Jacobian from mapping to reference element - @trixi_timeit timer() "Jacobian" apply_jacobian!(du, mesh, equations, dg, cache) - - # Calculate source terms - @trixi_timeit timer() "source terms" begin - calc_sources!(du, u, t, source_terms, equations, dg, cache) - end - - return nothing -end - #= `weak_form_kernel!` is only implemented for conserved terms as non-conservative terms should always be discretized in conjunction with a flux-splitting scheme, diff --git a/src/solvers/dgsem_structured/dg_3d.jl b/src/solvers/dgsem_structured/dg_3d.jl index 64f03d30dca..cd39623a367 100644 --- a/src/solvers/dgsem_structured/dg_3d.jl +++ b/src/solvers/dgsem_structured/dg_3d.jl @@ -5,51 +5,6 @@ @muladd begin #! 
format: noindent
 
-function rhs!(backend, du, u, t,
-              mesh::StructuredMesh{3}, equations,
-              boundary_conditions, source_terms::Source,
-              dg::DG, cache) where {Source}
-    # Reset du
-    @trixi_timeit timer() "reset ∂u/∂t" reset_du!(du, dg, cache)
-
-    # Calculate volume integral
-    @trixi_timeit timer() "volume integral" begin
-        calc_volume_integral!(backend, du, u, mesh,
-                              have_nonconservative_terms(equations), equations,
-                              dg.volume_integral, dg, cache)
-    end
-
-    # Calculate interface fluxes
-    @trixi_timeit timer() "interface flux" begin
-        calc_interface_flux!(cache, u, mesh,
-                             have_nonconservative_terms(equations), equations,
-                             dg.surface_integral, dg)
-    end
-
-    # Calculate boundary fluxes
-    @trixi_timeit timer() "boundary flux" begin
-        calc_boundary_flux!(cache, u, t, boundary_conditions, mesh, equations,
-                            dg.surface_integral, dg)
-    end
-
-    # Calculate surface integrals
-    @trixi_timeit timer() "surface integral" begin
-        calc_surface_integral!(backend, du, u, mesh, equations,
-                               dg.surface_integral, dg, cache)
-    end
-
-    # Apply Jacobian from mapping to reference element
-    @trixi_timeit timer() "Jacobian" apply_jacobian!(backend, du, mesh, equations, dg,
-                                                     cache)
-
-    # Calculate source terms
-    @trixi_timeit timer() "source terms" begin
-        calc_sources!(du, u, t, source_terms, equations, dg, cache)
-    end
-
-    return nothing
-end
-
 function calc_volume_integral!(backend::Nothing, du, u,
                                mesh::Union{StructuredMesh{3}, P4estMesh{3},
                                            T8codeMesh{3}},
diff --git a/src/solvers/dgsem_tree/dg.jl b/src/solvers/dgsem_tree/dg.jl
index 125773c1fd5..af4615726b0 100644
--- a/src/solvers/dgsem_tree/dg.jl
+++ b/src/solvers/dgsem_tree/dg.jl
@@ -38,6 +38,10 @@ include("dg_parallel.jl")
 # Helper structs for parabolic AMR
 include("containers_viscous.jl")
 
+# Some functions for a second-order finite-volume (MUSCL-like)
+# scheme on DG-subcells.
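# ---------------------------------------------------------------------------
# Editorial sketch (not part of the patch): minimal usage of the second-order
# subcell FV scheme wired in by the include below. The HLLC surface flux and
# the `minmod` limiter are illustrative assumptions, not requirements.
using Trixi

basis = LobattoLegendreBasis(3)
volume_integral = VolumeIntegralPureLGLFiniteVolumeO2(basis, flux_hllc;
                                                      reconstruction_mode = reconstruction_O2_inner,
                                                      slope_limiter = minmod)
solver = DGSEM(basis, flux_hllc, volume_integral)
# ---------------------------------------------------------------------------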
+include("subcell_finite_volume_O2.jl") + # 1D DG implementation include("dg_1d.jl") include("dg_1d_parabolic.jl") diff --git a/src/solvers/dgsem_tree/dg_1d.jl b/src/solvers/dgsem_tree/dg_1d.jl index 6f6d3dc3385..986bc6d6830 100644 --- a/src/solvers/dgsem_tree/dg_1d.jl +++ b/src/solvers/dgsem_tree/dg_1d.jl @@ -50,8 +50,8 @@ function create_cache(mesh::Union{TreeMesh{1}, StructuredMesh{1}}, equations, end function create_cache(mesh::Union{TreeMesh{1}, StructuredMesh{1}}, equations, - volume_integral::VolumeIntegralPureLGLFiniteVolume, dg::DG, - uEltype) + volume_integral::AbstractVolumeIntegralPureLGLFiniteVolume, + dg::DG, uEltype) A2dp1_x = Array{uEltype, 2} fstar1_L_threaded = A2dp1_x[A2dp1_x(undef, nvariables(equations), nnodes(dg) + 1) for _ in 1:Threads.nthreads()] @@ -217,14 +217,59 @@ end have_nonconservative_terms, equations, volume_flux_fv, dg::DGSEM, cache, element, alpha = true) @unpack fstar1_L_threaded, fstar1_R_threaded = cache - @unpack inverse_weights = dg.basis + @unpack inverse_weights = dg.basis # Plays role of inverse DG-subcell sizes # Calculate FV two-point fluxes fstar1_L = fstar1_L_threaded[Threads.threadid()] fstar1_R = fstar1_R_threaded[Threads.threadid()] - calcflux_fv!(fstar1_L, fstar1_R, u, mesh, have_nonconservative_terms, equations, - volume_flux_fv, - dg, element, cache) + calcflux_fv!(fstar1_L, fstar1_R, u, mesh, + have_nonconservative_terms, equations, + volume_flux_fv, dg, element, cache) + + # Calculate FV volume integral contribution + for i in eachnode(dg) + for v in eachvariable(equations) + du[v, i, element] += (alpha * + (inverse_weights[i] * + (fstar1_L[v, i + 1] - fstar1_R[v, i]))) + end + end + + return nothing +end + +function calc_volume_integral!(du, u, mesh::Union{TreeMesh{1}, StructuredMesh{1}}, + have_nonconservative_terms, equations, + volume_integral::VolumeIntegralPureLGLFiniteVolumeO2, + dg::DGSEM, cache) + @unpack x_interfaces, volume_flux_fv, reconstruction_mode, slope_limiter = volume_integral + + # Calculate LGL second-order FV volume integral + @threaded for element in eachelement(dg, cache) + fvO2_kernel!(du, u, mesh, + have_nonconservative_terms, equations, + volume_flux_fv, dg, cache, element, + x_interfaces, reconstruction_mode, slope_limiter, true) + end + + return nothing +end + +@inline function fvO2_kernel!(du, u, + mesh::Union{TreeMesh{1}, StructuredMesh{1}}, + nonconservative_terms, equations, + volume_flux_fv, dg::DGSEM, cache, element, + x_interfaces, reconstruction_mode, slope_limiter, + alpha = true) + @unpack fstar1_L_threaded, fstar1_R_threaded = cache + @unpack inverse_weights = dg.basis # Plays role of inverse DG-subcell sizes + + # Calculate FV two-point fluxes + fstar1_L = fstar1_L_threaded[Threads.threadid()] + fstar1_R = fstar1_R_threaded[Threads.threadid()] + calcflux_fvO2!(fstar1_L, fstar1_R, u, mesh, nonconservative_terms, equations, + volume_flux_fv, dg, element, cache, + x_interfaces, reconstruction_mode, slope_limiter) # Calculate FV volume integral contribution for i in eachnode(dg) @@ -291,6 +336,74 @@ end return nothing end +@inline function calcflux_fvO2!(fstar1_L, fstar1_R, u::AbstractArray{<:Any, 3}, + mesh::Union{TreeMesh{1}, StructuredMesh{1}}, + nonconservative_terms::False, + equations, volume_flux_fv, dg::DGSEM, element, cache, + x_interfaces, reconstruction_mode, slope_limiter) + fstar1_L[:, 1] .= zero(eltype(fstar1_L)) + fstar1_L[:, nnodes(dg) + 1] .= zero(eltype(fstar1_L)) + fstar1_R[:, 1] .= zero(eltype(fstar1_R)) + fstar1_R[:, nnodes(dg) + 1] .= zero(eltype(fstar1_R)) + + for i in 
2:nnodes(dg) # We compute FVO2 fluxes at the (nnodes(dg) - 1) subcell boundaries
+        # Reference element:
+        # -1 ------------------0------------------ 1 -> x
+        # Gauss-Lobatto-Legendre nodes (schematic for k = 3):
+        # . . . .
+        # ^ ^ ^ ^
+        # Node indices:
+        # 1 2 3 4
+        # The inner subcell boundaries are governed by the
+        # cumulative sum of the quadrature weights - 1 .
+        # -1 ------------------0------------------ 1 -> x
+        # w1-1 (w1+w2)-1 (w1+w2+w3)-1
+        # | | | | |
+        # Note that only the inner boundaries are stored.
+        # Subcell interface indices, loop only over 2 -> nnodes(dg) = 4
+        # 1 2 3 4 5
+        #
+        # In general a four-point stencil is required, since we reconstruct the
+        # piecewise linear solution in both subcells next to the subcell interface.
+        # Since these subcell boundaries are not aligned with the DG nodes,
+        # on each neighboring subcell two linear solutions are reconstructed => 4 point stencil.
+        # For the outer interfaces the stencil shrinks since we do not consider values
+        # outside the element (this is a volume integral).
+        #
+        # The left subcell node values are labelled `_ll` (left-left) and `_lr` (left-right), while
+        # the right subcell node values are labelled `_rl` (right-left) and `_rr` (right-right).
+
+        ## Obtain unlimited values in primitive variables ##
+
+        # Note: If i - 2 = 0 we do not go to the neighbor element, as one would do in a finite volume scheme.
+        # Here, we keep it purely cell-local, thus overshoots between elements are not ruled out.
+        u_ll = cons2prim(get_node_vars(u, equations, dg, max(1, i - 2), element),
+                         equations)
+        u_lr = cons2prim(get_node_vars(u, equations, dg, i - 1, element),
+                         equations)
+        u_rl = cons2prim(get_node_vars(u, equations, dg, i, element),
+                         equations)
+        # Note: If i + 1 > nnodes(dg) we do not go to the neighbor element, as one would do in a finite volume scheme.
+        # Here, we keep it purely cell-local, thus overshoots between elements are not ruled out.
+        u_rr = cons2prim(get_node_vars(u, equations, dg, min(nnodes(dg), i + 1),
+                                       element), equations)
+
+        ## Reconstruct values at interfaces with limiting ##
+        u_l, u_r = reconstruction_mode(u_ll, u_lr, u_rl, u_rr,
+                                       x_interfaces, i,
+                                       slope_limiter, dg)
+
+        ## Convert primitive variables back to conservative variables ##
+        flux = volume_flux_fv(prim2cons(u_l, equations), prim2cons(u_r, equations),
+                              1, equations) # orientation 1: x direction
+
+        set_node_vars!(fstar1_L, flux, equations, dg, i)
+        set_node_vars!(fstar1_R, flux, equations, dg, i)
+    end
+
+    return nothing
+end
+
 function prolong2interfaces!(cache, u, mesh::TreeMesh{1}, equations, dg::DG)
     @unpack interfaces = cache
     @unpack neighbor_ids = interfaces
diff --git a/src/solvers/dgsem_tree/dg_2d.jl b/src/solvers/dgsem_tree/dg_2d.jl
index 57f7bf81ec6..0d1b3c885b8 100644
--- a/src/solvers/dgsem_tree/dg_2d.jl
+++ b/src/solvers/dgsem_tree/dg_2d.jl
@@ -103,7 +103,8 @@ end
 # TODO: Taal discuss/refactor timer, allowing users to pass a custom timer?
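# ---------------------------------------------------------------------------
# Editorial worked example (not part of the patch): the subcell interfaces
# consumed by the `calcflux_fvO2!` loop above. For `LobattoLegendreBasis(3)`,
# i.e. four LGL nodes on [-1, 1], the quadrature weights are
# [1/6, 5/6, 5/6, 1/6], so
using Trixi

basis = LobattoLegendreBasis(3)
x_interfaces = cumsum(basis.weights)[1:(end - 1)] .- 1
# cumsum(basis.weights) == [1/6, 1, 11/6, 2], hence x_interfaces ≈ [-5/6, 0, 5/6]:
# the three inner subcell boundaries of the reference-element diagram above.
# ---------------------------------------------------------------------------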
function rhs!(backend, du, u, t, - mesh::Union{TreeMesh{2}, P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}}, + mesh::Union{TreeMesh{2}, P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}, + TreeMesh{3}, P4estMesh{3}, T8codeMesh{3}}, equations, boundary_conditions, source_terms::Source, dg::DG, cache) where {Source} @@ -295,7 +296,7 @@ end have_nonconservative_terms, equations, volume_flux_fv, dg::DGSEM, cache, element, alpha = true) @unpack fstar1_L_threaded, fstar1_R_threaded, fstar2_L_threaded, fstar2_R_threaded = cache - @unpack inverse_weights = dg.basis + @unpack inverse_weights = dg.basis # Plays role of inverse DG-subcell sizes # Calculate FV two-point fluxes fstar1_L = fstar1_L_threaded[Threads.threadid()] diff --git a/src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl b/src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl index bb1126c02f9..04889cae459 100644 --- a/src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl +++ b/src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl @@ -84,7 +84,7 @@ end have_nonconservative_terms, equations, volume_integral, limiter::SubcellLimiterIDP, dg::DGSEM, cache) - @unpack inverse_weights = dg.basis + @unpack inverse_weights = dg.basis # Plays role of inverse DG-subcell sizes @unpack volume_flux_dg, volume_flux_fv = volume_integral # high-order DG fluxes diff --git a/src/solvers/dgsem_tree/dg_3d.jl b/src/solvers/dgsem_tree/dg_3d.jl index 27a6158c637..664a8e168ef 100644 --- a/src/solvers/dgsem_tree/dg_3d.jl +++ b/src/solvers/dgsem_tree/dg_3d.jl @@ -122,77 +122,6 @@ function create_cache(mesh::TreeMesh{3}, equations, return cache end -# TODO: Taal discuss/refactor timer, allowing users to pass a custom timer? - -function rhs!(backend, du, u, t, - mesh::Union{TreeMesh{3}, P4estMesh{3}, T8codeMesh{3}}, equations, - boundary_conditions, source_terms::Source, - dg::DG, cache) where {Source} - # Reset du - @trixi_timeit timer() "reset ∂u/∂t" reset_du!(du, dg, cache) - - # Calculate volume integral - @trixi_timeit timer() "volume integral" begin - calc_volume_integral!(backend, du, u, mesh, - have_nonconservative_terms(equations), equations, - dg.volume_integral, dg, cache) - end - - # Prolong solution to interfaces - @trixi_timeit timer() "prolong2interfaces" begin - prolong2interfaces!(backend, cache, u, mesh, equations, dg) - end - - # Calculate interface fluxes - @trixi_timeit timer() "interface flux" begin - calc_interface_flux!(backend, cache.elements.surface_flux_values, mesh, - have_nonconservative_terms(equations), equations, - dg.surface_integral, dg, cache) - end - - # Prolong solution to boundaries - @trixi_timeit timer() "prolong2boundaries" begin - prolong2boundaries!(cache, u, mesh, equations, - dg.surface_integral, dg) - end - - # Calculate boundary fluxes - @trixi_timeit timer() "boundary flux" begin - calc_boundary_flux!(cache, t, boundary_conditions, mesh, equations, - dg.surface_integral, dg) - end - - # Prolong solution to mortars - @trixi_timeit timer() "prolong2mortars" begin - prolong2mortars!(cache, u, mesh, equations, - dg.mortar, dg) - end - - # Calculate mortar fluxes - @trixi_timeit timer() "mortar flux" begin - calc_mortar_flux!(cache.elements.surface_flux_values, mesh, - have_nonconservative_terms(equations), equations, - dg.mortar, dg.surface_integral, dg, cache) - end - - # Calculate surface integrals - @trixi_timeit timer() "surface integral" begin - calc_surface_integral!(backend, du, u, mesh, equations, - dg.surface_integral, dg, cache) - end - - # Apply Jacobian from mapping to reference element - @trixi_timeit timer() "Jacobian" 
apply_jacobian!(backend, du, mesh, equations, dg, - cache) - - # Calculate source terms - @trixi_timeit timer() "source terms" begin - calc_sources!(du, u, t, source_terms, equations, dg, cache) - end - - return nothing -end - #= `weak_form_kernel!` is only implemented for conserved terms as non-conservative terms should always be discretized in conjunction with a flux-splitting scheme, @@ -343,7 +272,7 @@ end have_nonconservative_terms, equations, volume_flux_fv, dg::DGSEM, cache, element, alpha = true) @unpack fstar1_L_threaded, fstar1_R_threaded, fstar2_L_threaded, fstar2_R_threaded, fstar3_L_threaded, fstar3_R_threaded = cache - @unpack inverse_weights = dg.basis + @unpack inverse_weights = dg.basis # Plays role of inverse DG-subcell sizes # Calculate FV two-point fluxes fstar1_L = fstar1_L_threaded[Threads.threadid()] diff --git a/src/solvers/dgsem_tree/subcell_finite_volume_O2.jl b/src/solvers/dgsem_tree/subcell_finite_volume_O2.jl new file mode 100644 index 00000000000..589b573154b --- /dev/null +++ b/src/solvers/dgsem_tree/subcell_finite_volume_O2.jl @@ -0,0 +1,247 @@ +""" + reconstruction_constant(u_ll, u_lr, u_rl, u_rr, + x_interfaces, + node_index, limiter, dg) + +Returns the constant "reconstructed" values `u_lr, u_rl` at the interface `x_interfaces[node_index - 1]`. +Supposed to be used in conjunction with [`VolumeIntegralPureLGLFiniteVolumeO2`](@ref). +Formally first order accurate. +If a first-order finite volume scheme is desired, [`VolumeIntegralPureLGLFiniteVolume`](@ref) is an +equivalent, but more efficient choice. +""" +@inline function reconstruction_constant(u_ll, u_lr, u_rl, u_rr, + x_interfaces, node_index, + limiter, dg) + return u_lr, u_rl +end + +# Helper functions for reconstructions below +@inline function reconstruction_linear(u_lr, u_rl, s_l, s_r, + x_lr, x_rl, x_interfaces, node_index) + # Linear reconstruction at the interface + u_lr = u_lr + s_l * (x_interfaces[node_index - 1] - x_lr) + u_rl = u_rl + s_r * (x_interfaces[node_index - 1] - x_rl) + + return u_lr, u_rl +end + +# Reference element: +# -1 ------------------0------------------ 1 -> x +# Gauss-Lobatto-Legendre nodes (schematic for k = 3): +# . . . . +# ^ ^ ^ ^ +# Node indices: +# 1 2 3 4 +# The inner subcell boundaries are governed by the +# cumulative sum of the quadrature weights - 1 . +# -1 ------------------0------------------ 1 -> x +# w1-1 (w1+w2)-1 (w1+w2+w3)-1 +# | | | | | +# Note that only the inner boundaries are stored. +# Subcell interface indices, loop only over 2 -> nnodes(dg) = 4 +# 1 2 3 4 5 +# +# In general a four-point stencil is required, since we reconstruct the +# piecewise linear solution in both subcells next to the subcell interface. +# Since these subcell boundaries are not aligned with the DG nodes, +# on each neighboring subcell two linear solutions are reconstructed => 4 point stencil. +# For the outer interfaces the stencil shrinks since we do not consider values +# outside the element (volume integral). +# +# The left subcell node values are labelled `_ll` (left-left) and `_lr` (left-right), while +# the right subcell node values are labelled `_rl` (right-left) and `_rr` (right-right). + +""" + reconstruction_O2_full(u_ll, u_lr, u_rl, u_rr, + x_interfaces, node_index, + limiter, dg::DGSEM) + +Returns the reconstructed values `u_lr, u_rl` at the interface `x_interfaces[node_index - 1]`. +Computes limited (linear) slopes on the subcells for a DGSEM element. +Supposed to be used in conjunction with [`VolumeIntegralPureLGLFiniteVolumeO2`](@ref). 
+
+The supplied `limiter` governs the choice of slopes given the nodal values
+`u_ll`, `u_lr`, `u_rl`, and `u_rr` at the (Gauss-Lobatto-Legendre) nodes.
+Total-Variation-Diminishing (TVD) choices for the limiter are
+ 1) [`minmod`](@ref)
+ 2) [`monotonized_central`](@ref)
+ 3) [`superbee`](@ref)
+ 4) [`vanLeer`](@ref)
+
+For `reconstruction_O2_full`, the reconstructed slopes are not limited at the cell boundaries.
+Formally second order accurate when used without a limiter, i.e., `limiter = `[`central_slope`](@ref).
+This approach corresponds to equation (79) described in
+- Rueda-Ramírez, Hennemann, Hindenlang, Winters, & Gassner (2021).
+  "An entropy stable nodal discontinuous Galerkin method for the resistive MHD equations.
+  Part II: Subcell finite volume shock capturing"
+  [JCP: 2021.110580](https://doi.org/10.1016/j.jcp.2021.110580)
+"""
+@inline function reconstruction_O2_full(u_ll, u_lr, u_rl, u_rr,
+                                        x_interfaces, node_index,
+                                        limiter, dg::DGSEM)
+    @unpack nodes = dg.basis
+    x_lr = nodes[node_index - 1]
+    x_rl = nodes[node_index]
+
+    # Slope between "middle" nodes
+    s_m = (u_rl - u_lr) / (x_rl - x_lr)
+
+    if node_index == 2 # Catch case ll == lr
+        s_l = s_m # Use unlimited "central" slope
+    else
+        x_ll = nodes[node_index - 2]
+        # Slope between "left" nodes
+        s_lr = (u_lr - u_ll) / (x_lr - x_ll)
+        # Select slope between extrapolated (left) and crossing (middle) slope
+        s_l = limiter.(s_lr, s_m)
+    end
+
+    if node_index == nnodes(dg) # Catch case rl == rr
+        s_r = s_m # Use unlimited "central" slope
+    else
+        x_rr = nodes[node_index + 1]
+        # Slope between "right" nodes
+        s_rl = (u_rr - u_rl) / (x_rr - x_rl)
+        # Select slope between crossing (middle) and extrapolated (right) slope
+        s_r = limiter.(s_m, s_rl)
+    end
+
+    return reconstruction_linear(u_lr, u_rl, s_l, s_r,
+                                 x_lr, x_rl, x_interfaces, node_index)
+end
+
+"""
+    reconstruction_O2_inner(u_ll, u_lr, u_rl, u_rr,
+                            x_interfaces, node_index,
+                            limiter, dg::DGSEM)
+
+Returns the reconstructed values `u_lr, u_rl` at the interface `x_interfaces[node_index - 1]`.
+Computes limited (linear) slopes on the *inner* subcells for a DGSEM element.
+Supposed to be used in conjunction with [`VolumeIntegralPureLGLFiniteVolumeO2`](@ref).
+
+The supplied `limiter` governs the choice of slopes given the nodal values
+`u_ll`, `u_lr`, `u_rl`, and `u_rr` at the (Gauss-Lobatto-Legendre) nodes.
+Total-Variation-Diminishing (TVD) choices for the limiter are
+ 1) [`minmod`](@ref)
+ 2) [`monotonized_central`](@ref)
+ 3) [`superbee`](@ref)
+ 4) [`vanLeer`](@ref)
+
+For the outer, i.e., boundary subcells, constant values are used, i.e., no reconstruction.
+This reduces the order of the scheme below 2.
+This approach corresponds to equation (78) described in
+- Rueda-Ramírez, Hennemann, Hindenlang, Winters, & Gassner (2021).
+  "An entropy stable nodal discontinuous Galerkin method for the resistive MHD equations.
+  Part II: Subcell finite volume shock capturing"
+  [JCP: 2021.110580](https://doi.org/10.1016/j.jcp.2021.110580)
+"""
+@inline function reconstruction_O2_inner(u_ll, u_lr, u_rl, u_rr,
+                                         x_interfaces, node_index,
+                                         limiter, dg::DGSEM)
+    @unpack nodes = dg.basis
+    x_lr = nodes[node_index - 1]
+    x_rl = nodes[node_index]
+
+    # Slope between "middle" nodes
+    s_m = (u_rl - u_lr) / (x_rl - x_lr)
+
+    if node_index == 2 # Catch case ll == lr
+        # Do not reconstruct at the boundary
+        s_l = zero(s_m)
+    else
+        x_ll = nodes[node_index - 2]
+        # Slope between "left" nodes
+        s_lr = (u_lr - u_ll) / (x_lr - x_ll)
+        # Select slope between extrapolated (left) and crossing (middle) slope
+        s_l = limiter.(s_lr, s_m)
+    end
+
+    if node_index == nnodes(dg) # Catch case rl == rr
+        # Do not reconstruct at the boundary
+        s_r = zero(s_m)
+    else
+        x_rr = nodes[node_index + 1]
+        # Slope between "right" nodes
+        s_rl = (u_rr - u_rl) / (x_rr - x_rl)
+        # Select slope between crossing (middle) and extrapolated (right) slope
+        s_r = limiter.(s_m, s_rl)
+    end
+
+    return reconstruction_linear(u_lr, u_rl, s_l, s_r,
+                                 x_lr, x_rl, x_interfaces, node_index)
+end
+
+"""
+    central_slope(sl, sr)
+
+Central, non-TVD reconstruction given left and right slopes `sl` and `sr`.
+Formally gives full order of accuracy at the expense of nonlinear stability.
+Similar in spirit to [`flux_central`](@ref).
+"""
+@inline function central_slope(sl, sr)
+    return 0.5f0 * (sl + sr)
+end
+
+"""
+    minmod(sl, sr)
+
+Classic minmod limiter function for a TVD reconstruction given left and right slopes `sl` and `sr`.
+There are many different ways in which the minmod limiter can be implemented.
+For reference, see for instance Eq. (6.27) in
+
+- Randall J. LeVeque (2002)
+  Finite Volume Methods for Hyperbolic Problems
+  [DOI: 10.1017/CBO9780511791253](https://doi.org/10.1017/CBO9780511791253)
+"""
+@inline function minmod(sl, sr)
+    return 0.5f0 * (sign(sl) + sign(sr)) * min(abs(sl), abs(sr))
+end
+
+"""
+    monotonized_central(sl, sr)
+
+Monotonized central limiter function for a TVD reconstruction given left and right slopes `sl` and `sr`.
+There are many different ways in which the monotonized central limiter can be implemented.
+For reference, see for instance Eq. (6.29) in
+
+- Randall J. LeVeque (2002)
+  Finite Volume Methods for Hyperbolic Problems
+  [DOI: 10.1017/CBO9780511791253](https://doi.org/10.1017/CBO9780511791253)
+"""
+@inline function monotonized_central(sl, sr)
+    # Use recursive property of minmod function
+    return minmod(0.5f0 * (sl + sr), minmod(2 * sl, 2 * sr))
+end
+
+"""
+    superbee(sl, sr)
+
+Superbee limiter function for a TVD reconstruction given left and right slopes `sl` and `sr`.
+There are many different ways in which the superbee limiter can be implemented.
+For reference, see for instance Eq. (6.28) in
+
+- Randall J. LeVeque (2002)
+  Finite Volume Methods for Hyperbolic Problems
+  [DOI: 10.1017/CBO9780511791253](https://doi.org/10.1017/CBO9780511791253)
+"""
+@inline function superbee(sl, sr)
+    return maxmod(minmod(sl, 2 * sr), minmod(2 * sl, sr))
+end
+
+"""
+    vanLeer(sl, sr)
+
+Symmetric limiter by van Leer.
+For reference, see page 70 in
+
+- Siddhartha Mishra, Ulrik Skre Fjordholm and Rémi Abgrall
+  Numerical methods for conservation laws and related equations.
+ [Link](https://metaphor.ethz.ch/x/2019/hs/401-4671-00L/literature/mishra_hyperbolic_pdes.pdf) +""" +@inline function vanLeer(sl, sr) + if abs(sl) + abs(sr) > zero(sl) + return (abs(sr) * sl + abs(sl) * sr) / (abs(sl) + abs(sr)) + else + return zero(sl) + end +end diff --git a/src/solvers/dgsem_tree/subcell_limiters_2d.jl b/src/solvers/dgsem_tree/subcell_limiters_2d.jl index c8e0373d9b6..cca91aa94b0 100644 --- a/src/solvers/dgsem_tree/subcell_limiters_2d.jl +++ b/src/solvers/dgsem_tree/subcell_limiters_2d.jl @@ -233,7 +233,7 @@ end semi, variable) mesh, equations, dg, cache = mesh_equations_solver_cache(semi) (; antidiffusive_flux1_L, antidiffusive_flux2_L, antidiffusive_flux1_R, antidiffusive_flux2_R) = cache.antidiffusive_fluxes - (; inverse_weights) = dg.basis + (; inverse_weights) = dg.basis # Plays role of inverse DG-subcell sizes (; variable_bounds) = limiter.cache.subcell_limiter_coefficients variable_string = string(variable) diff --git a/src/solvers/dgsem_unstructured/dg_2d.jl b/src/solvers/dgsem_unstructured/dg_2d.jl index e17197f843d..b5367b45d72 100644 --- a/src/solvers/dgsem_unstructured/dg_2d.jl +++ b/src/solvers/dgsem_unstructured/dg_2d.jl @@ -503,7 +503,7 @@ function calc_surface_integral!(du, u, mesh::UnstructuredMesh2D, end # This routine computes the maximum value of the discrete metric identities necessary to ensure -# that the approxmiation will be free-stream preserving (i.e. a constant solution remains constant) +# that the approximation will be free-stream preserving (i.e. a constant solution remains constant) # on a curvilinear mesh. # Note! Independent of the equation system and is only a check on the discrete mapping terms. # Can be used for a metric identities check on StructuredMesh{2} or UnstructuredMesh2D diff --git a/test/test_parabolic_2d.jl b/test/test_parabolic_2d.jl index 0d23b43ef4b..75f728ef6da 100644 --- a/test/test_parabolic_2d.jl +++ b/test/test_parabolic_2d.jl @@ -714,17 +714,21 @@ end @test_trixi_include(joinpath(EXAMPLES_DIR, "p4est_2d_dgsem", "elixir_navierstokes_viscous_shock_newton_krylov.jl"), tspan=(0.0, 0.1), + atol_lin_solve=1e-11, + rtol_lin_solve=1e-11, + atol_ode_solve=1e-10, + rtol_ode_solve=1e-10, l2=[ - 3.468233560427797e-5, - 2.64864594855224e-5, - 7.879490760481979e-10, - 2.8748482665365446e-5 + 3.428501006908931e-5, + 2.5967418005884837e-5, + 2.7084890458524478e-17, + 2.855861765163304e-5 ], linf=[ - 0.00018754529350140103, - 0.00014045634087878067, - 9.043610782328732e-9, - 0.00014499382160382268 + 0.00018762342908784646, + 0.0001405900207752664, + 3.661971738081151e-16, + 0.00014510700486747297 ]) # Ensure that we do not have excessive memory allocations # (e.g., from type instabilities) @@ -884,19 +888,19 @@ end @test_trixi_include(joinpath(EXAMPLES_DIR, "p4est_2d_dgsem", "elixir_navierstokes_blast_reflective.jl"), l2=[ - 0.08271777454941344, - 0.10020048140682014, - 0.10020048140682006, - 0.5954017435122945 + 0.015140702486341239, + 0.035675739843665635, + 0.035675739843665615, + 0.21415725909973524 ], linf=[ - 0.4785944470287504, - 0.7205772140501768, - 0.7205772140501767, - 3.25120873497427 + 0.2339198598727935, + 0.5951310665112189, + 0.5951310665112187, + 3.0106576605775333 ], - tspan=(0.0, 0.05), - abstol=1e-7, reltol=1e-7) + tspan=(0.0, 0.01), + abstol=1e-11, reltol=1e-11) # Ensure that we do not have excessive memory allocations # (e.g., from type instabilities) @test_allocations(Trixi.rhs!, semi, sol, 1000) diff --git a/test/test_structured_1d.jl b/test/test_structured_1d.jl index 04398b5ed9a..daf8ac6e1af 100644 --- 
a/test/test_structured_1d.jl +++ b/test/test_structured_1d.jl @@ -149,6 +149,27 @@ end @test_allocations(Trixi.rhs!, semi, sol, 1000) end +@trixi_testset "elixir_euler_source_terms_nonperiodic_fvO2.jl" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, + "elixir_euler_source_terms_nonperiodic_fvO2.jl"), + l2=[ + 0.0005159476609077155, + 0.000649450399792432, + 0.0010602371635625239 + ], + linf=[ + 0.0017927309507015377, + 0.001662532939591621, + 0.004580416775184837 + ]) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + @test_allocations(Trixi.rhs!, semi, sol, 1000) + + # Test/cover `:compact` printing + show(IOContext(IOBuffer(), :compact => true), MIME"text/plain"(), volume_integral) +end + @trixi_testset "elixir_euler_weak_blast_er.jl" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_euler_weak_blast_er.jl"), diff --git a/test/test_tree_1d_euler.jl b/test/test_tree_1d_euler.jl index b110c4fa465..614dcc1b370 100644 --- a/test/test_tree_1d_euler.jl +++ b/test/test_tree_1d_euler.jl @@ -55,6 +55,27 @@ end @test_allocations(Trixi.rhs!, semi, sol, 1000) end +@trixi_testset "elixir_euler_convergence_pure_fv.jl (O2, constant reconstruction)" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_euler_convergence_pure_fv.jl"), + volume_integral=VolumeIntegralPureLGLFiniteVolumeO2(LobattoLegendreBasis(3), + flux_hllc, + reconstruction_mode = reconstruction_constant, + slope_limiter = central_slope), + l2=[ + 0.019355699748523896, + 0.022326984561234497, + 0.02523665947241734 + ], + linf=[ + 0.02895961127645519, + 0.03293442484199227, + 0.04246098278632804 + ]) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + @test_allocations(Trixi.rhs!, semi, sol, 1000) +end + @trixi_testset "elixir_euler_density_wave.jl" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_euler_density_wave.jl"), l2=[ @@ -431,6 +452,23 @@ end @test_allocations(Trixi.rhs!, semi, sol, 1000) end +@trixi_testset "elixir_euler_convergence_pure_fvO2.jl" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_euler_convergence_pure_fvO2.jl"), + l2=[ + 0.0004651066144227485, + 0.0005058715155540577, + 0.0007705686813156139 + ], + linf=[ + 0.0014354711538595577, + 0.0014154880871579678, + 0.0027044481967184453 + ]) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + @test_allocations(Trixi.rhs!, semi, sol, 1000) +end + @trixi_testset "elixir_euler_laplace_diffusion.jl" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_euler_laplace_diffusion.jl"), l2=[0.10954500481114468, diff --git a/test/test_unit.jl b/test/test_unit.jl index 54403a3e3c2..3123b4022de 100644 --- a/test/test_unit.jl +++ b/test/test_unit.jl @@ -2405,6 +2405,58 @@ end 1.803e-5, atol = 5e-8) end +@testset "Slope Limiters" begin + sl = 1.0 + sr = -1.0 + + # Test for code coverage + dummy = 42 + @test reconstruction_constant(dummy, sl, sr, dummy, dummy, dummy, dummy, dummy) == + (sl, sr) + + @test minmod(sl, sr) == 0.0 + @test monotonized_central(sl, sr) == 0.0 + @test superbee(sl, sr) == 0.0 + @test vanLeer(sl, sr) == 0.0 + + sr = 0.5 + @test minmod(sl, sr) == 0.5 + @test monotonized_central(sl, sr) == 0.75 + @test superbee(sl, sr) == 1.0 + @test isapprox(vanLeer(sl, sr), 2 / 3) + + sl = -1.0 + sr = 0.0 + @test minmod(sl, sr) == 0.0 + @test monotonized_central(sl, sr) == 0.0 + @test superbee(sl, sr) == 0.0 + @test vanLeer(sl, sr) == 0.0 + + sr = -0.8 + @test minmod(sl, sr) == -0.8 + @test 
monotonized_central(sl, sr) == -0.9 + @test superbee(sl, sr) == -1.0 + @test isapprox(vanLeer(sl, sr), -8 / 9) + + # Test symmetry + @test minmod(sr, sl) == -0.8 + @test monotonized_central(sr, sl) == -0.9 + @test superbee(sr, sl) == -1.0 + @test isapprox(vanLeer(sr, sl), -8 / 9) + + sl = 1.0 + sr = 0.0 + @test minmod(sl, sr) == 0.0 + @test monotonized_central(sl, sr) == 0.0 + @test superbee(sl, sr) == 0.0 + @test vanLeer(sl, sr) == 0.0 + + @test central_slope(sl, sr) == 0.5 + + # Test van Leer zero case + @test vanLeer(0.0, 0.0) == 0.0 +end + # Velocity functions are present in many equations and are tested here @testset "Velocity functions for different equations" begin gamma = 1.4 From 013244d1bcfee588809908bd7bb865880add9f4b Mon Sep 17 00:00:00 2001 From: Benedict <135045760+benegee@users.noreply.github.com> Date: Wed, 8 Oct 2025 09:56:16 +0200 Subject: [PATCH 66/81] Apply suggestions from code review Co-authored-by: Valentin Churavy --- src/callbacks_step/save_solution.jl | 6 ++---- src/callbacks_step/stepsize_dg1d.jl | 12 ++++++------ src/callbacks_step/stepsize_dg2d.jl | 8 ++++---- src/callbacks_step/stepsize_dg3d.jl | 11 +++++------ 4 files changed, 17 insertions(+), 20 deletions(-) diff --git a/src/callbacks_step/save_solution.jl b/src/callbacks_step/save_solution.jl index 71196d6fe1f..a74d374390f 100644 --- a/src/callbacks_step/save_solution.jl +++ b/src/callbacks_step/save_solution.jl @@ -287,10 +287,8 @@ end system = "") # TODO GPU currently on CPU backend = trixi_backend(_u_ode) - if backend isa Nothing # TODO GPU KA CPU backend - u_ode = _u_ode - else - u_ode = Array(_u_ode) + if backend !== nothing + u_ode = Array(u_ode) end mesh, equations, solver, cache = mesh_equations_solver_cache(semi) u = wrap_array_native(u_ode, mesh, equations, solver, cache) diff --git a/src/callbacks_step/stepsize_dg1d.jl b/src/callbacks_step/stepsize_dg1d.jl index e0cac1ce57c..613bf3198b2 100644 --- a/src/callbacks_step/stepsize_dg1d.jl +++ b/src/callbacks_step/stepsize_dg1d.jl @@ -5,7 +5,7 @@ @muladd begin #! 
format: noindent -function max_dt(backend, u, t, mesh::TreeMesh{1}, +function max_dt(backend::Nothing, u, t, mesh::TreeMesh{1}, constant_speed::False, equations, dg::DG, cache) # to avoid a division by zero if the speed vanishes everywhere, @@ -29,7 +29,7 @@ function max_dt(backend, u, t, mesh::TreeMesh{1}, return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(backend, u, t, mesh::TreeMesh{1}, +function max_dt(backend::Nothing, u, t, mesh::TreeMesh{1}, constant_diffusivity::False, equations, equations_parabolic::AbstractEquationsParabolic, dg::DG, cache) @@ -52,7 +52,7 @@ function max_dt(backend, u, t, mesh::TreeMesh{1}, return 4 / (nnodes(dg) * max_scaled_speed) end -function max_dt(backend, u, t, mesh::TreeMesh{1}, +function max_dt(backend::Nothing, u, t, mesh::TreeMesh{1}, constant_speed::True, equations, dg::DG, cache) # to avoid a division by zero if the speed vanishes everywhere, @@ -72,7 +72,7 @@ function max_dt(backend, u, t, mesh::TreeMesh{1}, return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(backend, u, t, mesh::TreeMesh{1}, +function max_dt(backend::Nothing, u, t, mesh::TreeMesh{1}, constant_diffusivity::True, equations, equations_parabolic::AbstractEquationsParabolic, dg::DG, cache) @@ -91,7 +91,7 @@ function max_dt(backend, u, t, mesh::TreeMesh{1}, return 4 / (nnodes(dg) * max_scaled_speed) end -function max_dt(backend, u, t, mesh::StructuredMesh{1}, +function max_dt(backend::Nothing, u, t, mesh::StructuredMesh{1}, constant_speed::False, equations, dg::DG, cache) # to avoid a division by zero if the speed vanishes everywhere, @@ -119,7 +119,7 @@ function max_dt(backend, u, t, mesh::StructuredMesh{1}, return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(backend, u, t, mesh::StructuredMesh{1}, +function max_dt(backend::Nothing, u, t, mesh::StructuredMesh{1}, constant_speed::True, equations, dg::DG, cache) # to avoid a division by zero if the speed vanishes everywhere, diff --git a/src/callbacks_step/stepsize_dg2d.jl b/src/callbacks_step/stepsize_dg2d.jl index fe30e5019b7..a5d5ba53c2a 100644 --- a/src/callbacks_step/stepsize_dg2d.jl +++ b/src/callbacks_step/stepsize_dg2d.jl @@ -5,7 +5,7 @@ @muladd begin #! format: noindent -function max_dt(backend, u, t, mesh::TreeMesh{2}, +function max_dt(backend::Nothing, u, t, mesh::TreeMesh{2}, constant_speed::False, equations, dg::DG, cache) # to avoid a division by zero if the speed vanishes everywhere, # e.g. for steady-state linear advection @@ -29,7 +29,7 @@ function max_dt(backend, u, t, mesh::TreeMesh{2}, return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(backend, u, t, mesh::TreeMesh{2}, +function max_dt(backend::Nothing, u, t, mesh::TreeMesh{2}, constant_speed::True, equations, dg::DG, cache) # to avoid a division by zero if the speed vanishes everywhere, # e.g. 
for steady-state linear advection @@ -82,7 +82,7 @@ function max_dt(backend, u, t, mesh::ParallelTreeMesh{2}, return dt end -function max_dt(backend, u, t, +function max_dt(backend::Nothing, u, t, mesh::Union{StructuredMesh{2}, UnstructuredMesh2D, P4estMesh{2}, T8codeMesh{2}, StructuredMeshView{2}}, constant_speed::False, equations, dg::DG, cache) @@ -120,7 +120,7 @@ function max_dt(backend, u, t, return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(backend, u, t, +function max_dt(backend::Nothing, u, t, mesh::Union{StructuredMesh{2}, UnstructuredMesh2D, P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}, StructuredMeshView{2}}, constant_speed::True, equations, dg::DG, cache) diff --git a/src/callbacks_step/stepsize_dg3d.jl b/src/callbacks_step/stepsize_dg3d.jl index 8cdc7d74487..c211f765a93 100644 --- a/src/callbacks_step/stepsize_dg3d.jl +++ b/src/callbacks_step/stepsize_dg3d.jl @@ -5,7 +5,7 @@ @muladd begin #! format: noindent -function max_dt(backend, u, t, mesh::TreeMesh{3}, +function max_dt(backend::Nothing, u, t, mesh::TreeMesh{3}, constant_speed::False, equations, dg::DG, cache) # to avoid a division by zero if the speed vanishes everywhere, # e.g. for steady-state linear advection @@ -31,7 +31,7 @@ function max_dt(backend, u, t, mesh::TreeMesh{3}, return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(backend, u, t, mesh::TreeMesh{3}, +function max_dt(backend::Nothing, u, t, mesh::TreeMesh{3}, constant_speed::True, equations, dg::DG, cache) # to avoid a division by zero if the speed vanishes everywhere, # e.g. for steady-state linear advection @@ -135,10 +135,9 @@ function max_dt(backend, u, t, # e.g. for steady-state linear advection max_scaled_speed = nextfloat(zero(t)) - if backend isa Nothing # TODO GPU KA CPU backend as well - @unpack contravariant_vectors, inverse_jacobian = cache.elements - else - # TODO GPU is this sufficient? 
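# ---------------------------------------------------------------------------
# Editorial sketch (not part of the patch): the host-transfer fallback this
# hunk adopts. Until `max_dt` is ported to GPU kernels, device-resident
# geometry arrays are copied to host memory with `Array(...)` so that the
# existing serial reduction can run unchanged. A stand-alone analogue, with
# `backend` and a possibly device-resident array `a` assumed:
host_array(backend, a) = backend === nothing ? a : Array(a)
# e.g. max_scaled_speed = maximum(abs, host_array(backend, a))
# ---------------------------------------------------------------------------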
+ @unpack contravariant_vectors, inverse_jacobian = cache.elements + if backend !== nothing + # TODO: Port to GPU contravariant_vectors = Array(cache.elements.contravariant_vectors) inverse_jacobian = Array(cache.elements.inverse_jacobian) end From 8a98d27940a3cdd947f88f0358c35ee513e45d86 Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Wed, 8 Oct 2025 10:00:29 +0200 Subject: [PATCH 67/81] !fixup --- src/callbacks_step/save_solution.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/callbacks_step/save_solution.jl b/src/callbacks_step/save_solution.jl index a74d374390f..12f63792281 100644 --- a/src/callbacks_step/save_solution.jl +++ b/src/callbacks_step/save_solution.jl @@ -280,13 +280,13 @@ end return nothing end -@inline function save_solution_file(_u_ode, t, dt, iter, +@inline function save_solution_file(u_ode, t, dt, iter, semi::AbstractSemidiscretization, solution_callback, element_variables = Dict{Symbol, Any}(), node_variables = Dict{Symbol, Any}(); system = "") # TODO GPU currently on CPU - backend = trixi_backend(_u_ode) + backend = trixi_backend(u_ode) if backend !== nothing u_ode = Array(u_ode) end From 7de1e571b08623234735d3d769e96318966e484e Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Wed, 8 Oct 2025 10:12:19 +0200 Subject: [PATCH 68/81] fmt --- src/callbacks_step/stepsize_dg3d.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/callbacks_step/stepsize_dg3d.jl b/src/callbacks_step/stepsize_dg3d.jl index c211f765a93..b3fdd3d9807 100644 --- a/src/callbacks_step/stepsize_dg3d.jl +++ b/src/callbacks_step/stepsize_dg3d.jl @@ -136,7 +136,7 @@ function max_dt(backend, u, t, max_scaled_speed = nextfloat(zero(t)) @unpack contravariant_vectors, inverse_jacobian = cache.elements - if backend !== nothing + if backend !== nothing # TODO: Port to GPU contravariant_vectors = Array(cache.elements.contravariant_vectors) inverse_jacobian = Array(cache.elements.inverse_jacobian) From 31a65cb2acb608de40ea63452a6e22a38a0b249d Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Wed, 8 Oct 2025 11:27:38 +0200 Subject: [PATCH 69/81] pass backend through --- src/callbacks_step/stepsize_dg2d.jl | 16 ++++++++-------- src/callbacks_step/stepsize_dg3d.jl | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/callbacks_step/stepsize_dg2d.jl b/src/callbacks_step/stepsize_dg2d.jl index a5d5ba53c2a..a6c217f2885 100644 --- a/src/callbacks_step/stepsize_dg2d.jl +++ b/src/callbacks_step/stepsize_dg2d.jl @@ -55,10 +55,10 @@ function max_dt(backend, u, t, mesh::ParallelTreeMesh{2}, # and create some MPI array type, overloading broadcasting and mapreduce etc. # Then, this specific array type should also work well with DiffEq etc. dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), TreeMesh{2}, + Tuple{typeof(backend), typeof(u), typeof(t), TreeMesh{2}, typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + backend, u, t, mesh, constant_speed, equations, dg, cache) # Base.min instead of min needed, see comment in src/auxiliary/math.jl dt = MPI.Allreduce!(Ref(dt), Base.min, mpi_comm())[] @@ -72,10 +72,10 @@ function max_dt(backend, u, t, mesh::ParallelTreeMesh{2}, # and create some MPI array type, overloading broadcasting and mapreduce etc. # Then, this specific array type should also work well with DiffEq etc. 
dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), TreeMesh{2}, + Tuple{typeof(backend), typeof(u), typeof(t), TreeMesh{2}, typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + backend, u, t, mesh, constant_speed, equations, dg, cache) # Base.min instead of min needed, see comment in src/auxiliary/math.jl dt = MPI.Allreduce!(Ref(dt), Base.min, mpi_comm())[] @@ -161,10 +161,10 @@ function max_dt(backend, u, t, mesh::ParallelP4estMesh{2}, # and create some MPI array type, overloading broadcasting and mapreduce etc. # Then, this specific array type should also work well with DiffEq etc. dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), P4estMesh{2}, + Tuple{typeof(backend), typeof(u), typeof(t), P4estMesh{2}, typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + backend, u, t, mesh, constant_speed, equations, dg, cache) # Base.min instead of min needed, see comment in src/auxiliary/math.jl dt = MPI.Allreduce!(Ref(dt), Base.min, mpi_comm())[] @@ -178,10 +178,10 @@ function max_dt(backend, u, t, mesh::ParallelP4estMesh{2}, # and create some MPI array type, overloading broadcasting and mapreduce etc. # Then, this specific array type should also work well with DiffEq etc. dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), P4estMesh{2}, + Tuple{typeof(backend), typeof(u), typeof(t), P4estMesh{2}, typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + backend, u, t, mesh, constant_speed, equations, dg, cache) # Base.min instead of min needed, see comment in src/auxiliary/math.jl dt = MPI.Allreduce!(Ref(dt), Base.min, mpi_comm())[] diff --git a/src/callbacks_step/stepsize_dg3d.jl b/src/callbacks_step/stepsize_dg3d.jl index b3fdd3d9807..1f67dfe7fc2 100644 --- a/src/callbacks_step/stepsize_dg3d.jl +++ b/src/callbacks_step/stepsize_dg3d.jl @@ -180,10 +180,10 @@ function max_dt(backend, u, t, mesh::ParallelP4estMesh{3}, # and create some MPI array type, overloading broadcasting and mapreduce etc. # Then, this specific array type should also work well with DiffEq etc. dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), P4estMesh{3}, + Tuple{typeof(backend), typeof(u), typeof(t), P4estMesh{3}, typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + backend, u, t, mesh, constant_speed, equations, dg, cache) # Base.min instead of min needed, see comment in src/auxiliary/math.jl dt = MPI.Allreduce!(Ref(dt), Base.min, mpi_comm())[] @@ -197,10 +197,10 @@ function max_dt(backend, u, t, mesh::ParallelP4estMesh{3}, # and create some MPI array type, overloading broadcasting and mapreduce etc. # Then, this specific array type should also work well with DiffEq etc. dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), P4estMesh{3}, + Tuple{typeof(backend), typeof(u), typeof(t), P4estMesh{3}, typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + backend, u, t, mesh, constant_speed, equations, dg, cache) # Base.min instead of min needed, see comment in src/auxiliary/math.jl dt = MPI.Allreduce!(Ref(dt), Base.min, mpi_comm())[] @@ -214,10 +214,10 @@ function max_dt(backend, u, t, mesh::ParallelT8codeMesh{3}, # and create some MPI array type, overloading broadcasting and mapreduce etc. # Then, this specific array type should also work well with DiffEq etc. 
dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), T8codeMesh{3}, + Tuple{typeof(backend), typeof(u), typeof(t), T8codeMesh{3}, typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + backend, u, t, mesh, constant_speed, equations, dg, cache) # Base.min instead of min needed, see comment in src/auxiliary/math.jl dt = MPI.Allreduce!(Ref(dt), Base.min, mpi_comm())[] @@ -231,10 +231,10 @@ function max_dt(backend, u, t, mesh::ParallelT8codeMesh{3}, # and create some MPI array type, overloading broadcasting and mapreduce etc. # Then, this specific array type should also work well with DiffEq etc. dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), T8codeMesh{3}, + Tuple{typeof(backend), typeof(u), typeof(t), T8codeMesh{3}, typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + backend, u, t, mesh, constant_speed, equations, dg, cache) # Base.min instead of min needed, see comment in src/auxiliary/math.jl dt = MPI.Allreduce!(Ref(dt), Base.min, mpi_comm())[] From 4064e79478a7b1a452bb7f9a63fb040d9bc83e9f Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Wed, 8 Oct 2025 11:28:30 +0200 Subject: [PATCH 70/81] fixes --- src/solvers/dgsem_p4est/dg_2d.jl | 2 +- src/solvers/dgsem_structured/dg.jl | 2 +- src/solvers/dgsem_tree/dg_1d.jl | 6 ++++-- src/solvers/dgsem_tree/dg_2d.jl | 5 +++-- src/solvers/dgsem_tree/dg_3d.jl | 3 ++- src/solvers/fdsbp_tree/fdsbp_2d.jl | 8 ++++---- src/solvers/fdsbp_tree/fdsbp_3d.jl | 8 ++++---- 7 files changed, 19 insertions(+), 15 deletions(-) diff --git a/src/solvers/dgsem_p4est/dg_2d.jl b/src/solvers/dgsem_p4est/dg_2d.jl index b417e87a77d..87565720c99 100644 --- a/src/solvers/dgsem_p4est/dg_2d.jl +++ b/src/solvers/dgsem_p4est/dg_2d.jl @@ -63,7 +63,7 @@ end end end -function prolong2interfaces!(cache, u, +function prolong2interfaces!(backend::Nothing, cache, u, mesh::Union{P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}}, equations, dg::DG) @unpack interfaces = cache diff --git a/src/solvers/dgsem_structured/dg.jl b/src/solvers/dgsem_structured/dg.jl index 6cc2791c27e..17bd6dd0f20 100644 --- a/src/solvers/dgsem_structured/dg.jl +++ b/src/solvers/dgsem_structured/dg.jl @@ -35,7 +35,7 @@ function calc_boundary_flux!(cache, u, t, boundary_condition::BoundaryConditionP @assert isperiodic(mesh) end -function rhs!(du, u, t, +function rhs!(backend, du, u, t, mesh::Union{StructuredMesh, StructuredMeshView{2}}, equations, boundary_conditions, source_terms::Source, dg::DG, cache) where {Source} diff --git a/src/solvers/dgsem_tree/dg_1d.jl b/src/solvers/dgsem_tree/dg_1d.jl index 986bc6d6830..d7e8c0e8464 100644 --- a/src/solvers/dgsem_tree/dg_1d.jl +++ b/src/solvers/dgsem_tree/dg_1d.jl @@ -238,7 +238,8 @@ end return nothing end -function calc_volume_integral!(du, u, mesh::Union{TreeMesh{1}, StructuredMesh{1}}, +function calc_volume_integral!(backend::Nothing, du, u, + mesh::Union{TreeMesh{1}, StructuredMesh{1}}, have_nonconservative_terms, equations, volume_integral::VolumeIntegralPureLGLFiniteVolumeO2, dg::DGSEM, cache) @@ -404,7 +405,8 @@ end return nothing end -function prolong2interfaces!(cache, u, mesh::TreeMesh{1}, equations, dg::DG) +function prolong2interfaces!(backend::Nothing, cache, u, mesh::TreeMesh{1}, equations, + dg::DG) @unpack interfaces = cache @unpack neighbor_ids = interfaces interfaces_u = interfaces.u diff --git a/src/solvers/dgsem_tree/dg_2d.jl b/src/solvers/dgsem_tree/dg_2d.jl index 0d1b3c885b8..1d8b6f65f8d 100644 --- 
a/src/solvers/dgsem_tree/dg_2d.jl +++ b/src/solvers/dgsem_tree/dg_2d.jl @@ -120,7 +120,7 @@ function rhs!(backend, du, u, t, # Prolong solution to interfaces @trixi_timeit timer() "prolong2interfaces" begin - prolong2interfaces!(cache, u, mesh, equations, dg) + prolong2interfaces!(backend, cache, u, mesh, equations, dg) end # Calculate interface fluxes @@ -439,7 +439,8 @@ end return nothing end -function prolong2interfaces!(cache, u, mesh::TreeMesh{2}, equations, dg::DG) +function prolong2interfaces!(backend::Nothing, cache, u, mesh::TreeMesh{2}, equations, + dg::DG) @unpack interfaces = cache @unpack orientations, neighbor_ids = interfaces interfaces_u = interfaces.u diff --git a/src/solvers/dgsem_tree/dg_3d.jl b/src/solvers/dgsem_tree/dg_3d.jl index 664a8e168ef..b04fd0f885b 100644 --- a/src/solvers/dgsem_tree/dg_3d.jl +++ b/src/solvers/dgsem_tree/dg_3d.jl @@ -433,7 +433,8 @@ end return nothing end -function prolong2interfaces!(backend, cache, u, mesh::TreeMesh{3}, equations, dg::DG) +function prolong2interfaces!(backend::Nothing, cache, u, mesh::TreeMesh{3}, equations, + dg::DG) @unpack interfaces = cache @unpack orientations, neighbor_ids = interfaces interfaces_u = interfaces.u diff --git a/src/solvers/fdsbp_tree/fdsbp_2d.jl b/src/solvers/fdsbp_tree/fdsbp_2d.jl index 6f642ef1ab6..132b5161e78 100644 --- a/src/solvers/fdsbp_tree/fdsbp_2d.jl +++ b/src/solvers/fdsbp_tree/fdsbp_2d.jl @@ -159,7 +159,7 @@ function calc_volume_integral!(backend::Nothing, du, u, return nothing end -function calc_surface_integral!(du, u, mesh::TreeMesh{2}, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh{2}, equations, surface_integral::SurfaceIntegralStrongForm, dg::DG, cache) inv_weight_left = inv(left_boundary_weight(dg.basis)) @@ -202,7 +202,7 @@ function calc_surface_integral!(du, u, mesh::TreeMesh{2}, end # Periodic FDSBP operators need to use a single element without boundaries -function calc_surface_integral!(du, u, mesh::TreeMesh2D, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh2D, equations, surface_integral::SurfaceIntegralStrongForm, dg::PeriodicFDSBP, cache) @assert nelements(dg, cache) == 1 @@ -260,7 +260,7 @@ end # in the specialized `calc_interface_flux` routine. These SATs are still of # a strong form penalty type, except that the interior flux at a particular # side of the element are computed in the upwind direction. 
-function calc_surface_integral!(du, u, mesh::TreeMesh{2}, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh{2}, equations, surface_integral::SurfaceIntegralUpwind, dg::FDSBP, cache) inv_weight_left = inv(left_boundary_weight(dg.basis)) @@ -304,7 +304,7 @@ function calc_surface_integral!(du, u, mesh::TreeMesh{2}, end # Periodic FDSBP operators need to use a single element without boundaries -function calc_surface_integral!(du, u, mesh::TreeMesh2D, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh2D, equations, surface_integral::SurfaceIntegralUpwind, dg::PeriodicFDSBP, cache) @assert nelements(dg, cache) == 1 diff --git a/src/solvers/fdsbp_tree/fdsbp_3d.jl b/src/solvers/fdsbp_tree/fdsbp_3d.jl index 1eff0986e17..9fe7cd3044d 100644 --- a/src/solvers/fdsbp_tree/fdsbp_3d.jl +++ b/src/solvers/fdsbp_tree/fdsbp_3d.jl @@ -181,7 +181,7 @@ function calc_volume_integral!(backend::Nothing, du, u, return nothing end -function calc_surface_integral!(du, u, mesh::TreeMesh{3}, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh{3}, equations, surface_integral::SurfaceIntegralStrongForm, dg::DG, cache) inv_weight_left = inv(left_boundary_weight(dg.basis)) @@ -238,7 +238,7 @@ function calc_surface_integral!(du, u, mesh::TreeMesh{3}, end # Periodic FDSBP operators need to use a single element without boundaries -function calc_surface_integral!(du, u, mesh::TreeMesh3D, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh3D, equations, surface_integral::SurfaceIntegralStrongForm, dg::PeriodicFDSBP, cache) @assert nelements(dg, cache) == 1 @@ -297,7 +297,7 @@ end # in the specialized `calc_interface_flux` routine. These SATs are still of # a strong form penalty type, except that the interior flux at a particular # side of the element are computed in the upwind direction. 
-function calc_surface_integral!(du, u, mesh::TreeMesh{3}, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh{3}, equations, surface_integral::SurfaceIntegralUpwind, dg::FDSBP, cache) inv_weight_left = inv(left_boundary_weight(dg.basis)) @@ -355,7 +355,7 @@ function calc_surface_integral!(du, u, mesh::TreeMesh{3}, end # Periodic FDSBP operators need to use a single element without boundaries -function calc_surface_integral!(du, u, mesh::TreeMesh3D, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh3D, equations, surface_integral::SurfaceIntegralUpwind, dg::PeriodicFDSBP, cache) @assert nelements(dg, cache) == 1 From af50cda41b961227336402bc4080f0be9a73f122 Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Wed, 8 Oct 2025 12:02:00 +0200 Subject: [PATCH 71/81] backends here and there --- src/solvers/dgsem_p4est/dg_2d.jl | 2 +- src/solvers/dgsem_p4est/dg_3d_parallel.jl | 10 ++++++---- src/solvers/dgsem_structured/dg.jl | 3 ++- src/solvers/dgsem_structured/dg_2d.jl | 2 +- src/solvers/dgsem_tree/dg_1d.jl | 2 +- src/solvers/dgsem_tree/dg_2d.jl | 11 ++++++----- src/solvers/dgsem_tree/dg_2d_parallel.jl | 9 +++++---- src/solvers/dgsem_tree/dg_3d.jl | 4 ++-- src/solvers/dgsem_unstructured/dg_2d.jl | 3 ++- src/solvers/fdsbp_tree/fdsbp_2d.jl | 2 +- src/solvers/fdsbp_tree/fdsbp_3d.jl | 2 +- 11 files changed, 28 insertions(+), 22 deletions(-) diff --git a/src/solvers/dgsem_p4est/dg_2d.jl b/src/solvers/dgsem_p4est/dg_2d.jl index 87565720c99..56b6568072d 100644 --- a/src/solvers/dgsem_p4est/dg_2d.jl +++ b/src/solvers/dgsem_p4est/dg_2d.jl @@ -119,7 +119,7 @@ function prolong2interfaces!(backend::Nothing, cache, u, return nothing end -function calc_interface_flux!(surface_flux_values, +function calc_interface_flux!(backend::Nothing, surface_flux_values, mesh::Union{P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}}, have_nonconservative_terms, diff --git a/src/solvers/dgsem_p4est/dg_3d_parallel.jl b/src/solvers/dgsem_p4est/dg_3d_parallel.jl index 616ce759486..188560fa95f 100644 --- a/src/solvers/dgsem_p4est/dg_3d_parallel.jl +++ b/src/solvers/dgsem_p4est/dg_3d_parallel.jl @@ -40,12 +40,12 @@ function rhs!(backend, du, u, t, # Prolong solution to interfaces @trixi_timeit timer() "prolong2interfaces" begin - prolong2interfaces!(cache, u, mesh, equations, dg) + prolong2interfaces!(backend, cache, u, mesh, equations, dg) end # Calculate interface fluxes @trixi_timeit timer() "interface flux" begin - calc_interface_flux!(cache.elements.surface_flux_values, mesh, + calc_interface_flux!(backend, cache.elements.surface_flux_values, mesh, have_nonconservative_terms(equations), equations, dg.surface_integral, dg, cache) end @@ -95,11 +95,13 @@ function rhs!(backend, du, u, t, # Calculate surface integrals @trixi_timeit timer() "surface integral" begin - calc_surface_integral!(du, u, mesh, equations, dg.surface_integral, dg, cache) + calc_surface_integral!(backend, du, u, mesh, equations, dg.surface_integral, dg, + cache) end # Apply Jacobian from mapping to reference element - @trixi_timeit timer() "Jacobian" apply_jacobian!(du, mesh, equations, dg, cache) + @trixi_timeit timer() "Jacobian" apply_jacobian!(backend, du, mesh, equations, dg, + cache) # Calculate source terms @trixi_timeit timer() "source terms" begin diff --git a/src/solvers/dgsem_structured/dg.jl b/src/solvers/dgsem_structured/dg.jl index 17bd6dd0f20..931a5b81602 100644 --- a/src/solvers/dgsem_structured/dg.jl +++ b/src/solvers/dgsem_structured/dg.jl @@ -69,7 +69,8 @@ function rhs!(backend, du, u, t, end # 
Apply Jacobian from mapping to reference element - @trixi_timeit timer() "Jacobian" apply_jacobian!(du, mesh, equations, dg, cache) + @trixi_timeit timer() "Jacobian" apply_jacobian!(backend, du, mesh, equations, dg, + cache) # Calculate source terms @trixi_timeit timer() "source terms" begin diff --git a/src/solvers/dgsem_structured/dg_2d.jl b/src/solvers/dgsem_structured/dg_2d.jl index 6430b61b276..1883fa5f881 100644 --- a/src/solvers/dgsem_structured/dg_2d.jl +++ b/src/solvers/dgsem_structured/dg_2d.jl @@ -588,7 +588,7 @@ function calc_boundary_flux!(cache, u, t, boundary_conditions::NamedTuple, return nothing end -function apply_jacobian!(du, +function apply_jacobian!(backend::Nothing, du, mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, UnstructuredMesh2D, P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}}, diff --git a/src/solvers/dgsem_tree/dg_1d.jl b/src/solvers/dgsem_tree/dg_1d.jl index d7e8c0e8464..30cdd500646 100644 --- a/src/solvers/dgsem_tree/dg_1d.jl +++ b/src/solvers/dgsem_tree/dg_1d.jl @@ -405,7 +405,7 @@ end return nothing end -function prolong2interfaces!(backend::Nothing, cache, u, mesh::TreeMesh{1}, equations, +function prolong2interfaces!(cache, u, mesh::TreeMesh{1}, equations, dg::DG) @unpack interfaces = cache @unpack neighbor_ids = interfaces diff --git a/src/solvers/dgsem_tree/dg_2d.jl b/src/solvers/dgsem_tree/dg_2d.jl index 1d8b6f65f8d..fbac4822c60 100644 --- a/src/solvers/dgsem_tree/dg_2d.jl +++ b/src/solvers/dgsem_tree/dg_2d.jl @@ -125,7 +125,7 @@ function rhs!(backend, du, u, t, # Calculate interface fluxes @trixi_timeit timer() "interface flux" begin - calc_interface_flux!(cache.elements.surface_flux_values, mesh, + calc_interface_flux!(backend, cache.elements.surface_flux_values, mesh, have_nonconservative_terms(equations), equations, dg.surface_integral, dg, cache) end @@ -162,7 +162,8 @@ function rhs!(backend, du, u, t, end # Apply Jacobian from mapping to reference element - @trixi_timeit timer() "Jacobian" apply_jacobian!(du, mesh, equations, dg, cache) + @trixi_timeit timer() "Jacobian" apply_jacobian!(backend, du, mesh, equations, dg, + cache) # Calculate source terms @trixi_timeit timer() "source terms" begin @@ -467,7 +468,7 @@ function prolong2interfaces!(backend::Nothing, cache, u, mesh::TreeMesh{2}, equa return nothing end -function calc_interface_flux!(surface_flux_values, +function calc_interface_flux!(backend::Nothing, surface_flux_values, mesh::TreeMesh{2}, have_nonconservative_terms::False, equations, surface_integral, dg::DG, cache) @@ -501,7 +502,7 @@ function calc_interface_flux!(surface_flux_values, return nothing end -function calc_interface_flux!(surface_flux_values, +function calc_interface_flux!(backend::Nothing, surface_flux_values, mesh::TreeMesh{2}, have_nonconservative_terms::True, equations, surface_integral, dg::DG, cache) @@ -1066,7 +1067,7 @@ function calc_surface_integral!(backend::Nothing, du, u, return nothing end -function apply_jacobian!(du, mesh::TreeMesh{2}, +function apply_jacobian!(backend::Nothing, du, mesh::TreeMesh{2}, equations, dg::DG, cache) @unpack inverse_jacobian = cache.elements diff --git a/src/solvers/dgsem_tree/dg_2d_parallel.jl b/src/solvers/dgsem_tree/dg_2d_parallel.jl index b4ab0bdaaee..614af8e0da1 100644 --- a/src/solvers/dgsem_tree/dg_2d_parallel.jl +++ b/src/solvers/dgsem_tree/dg_2d_parallel.jl @@ -484,12 +484,12 @@ function rhs!(backend, du, u, t, # Prolong solution to interfaces # TODO: Taal decide order of arguments, consistent vs. modified cache first? 
@trixi_timeit timer() "prolong2interfaces" begin - prolong2interfaces!(cache, u, mesh, equations, dg) + prolong2interfaces!(backend, cache, u, mesh, equations, dg) end # Calculate interface fluxes @trixi_timeit timer() "interface flux" begin - calc_interface_flux!(cache.elements.surface_flux_values, mesh, + calc_interface_flux!(backend, cache.elements.surface_flux_values, mesh, have_nonconservative_terms(equations), equations, dg.surface_integral, dg, cache) end @@ -540,12 +540,13 @@ function rhs!(backend, du, u, t, # Calculate surface integrals @trixi_timeit timer() "surface integral" begin - calc_surface_integral!(du, u, mesh, equations, + calc_surface_integral!(backend, du, u, mesh, equations, dg.surface_integral, dg, cache) end # Apply Jacobian from mapping to reference element - @trixi_timeit timer() "Jacobian" apply_jacobian!(du, mesh, equations, dg, cache) + @trixi_timeit timer() "Jacobian" apply_jacobian!(backend, du, mesh, equations, dg, + cache) # Calculate source terms @trixi_timeit timer() "source terms" begin diff --git a/src/solvers/dgsem_tree/dg_3d.jl b/src/solvers/dgsem_tree/dg_3d.jl index b04fd0f885b..d181eab61fd 100644 --- a/src/solvers/dgsem_tree/dg_3d.jl +++ b/src/solvers/dgsem_tree/dg_3d.jl @@ -472,7 +472,7 @@ function prolong2interfaces!(backend::Nothing, cache, u, mesh::TreeMesh{3}, equa return nothing end -function calc_interface_flux!(backend, surface_flux_values, +function calc_interface_flux!(backend::Nothing, surface_flux_values, mesh::TreeMesh{3}, have_nonconservative_terms::False, equations, surface_integral, dg::DG, cache) @@ -507,7 +507,7 @@ function calc_interface_flux!(backend, surface_flux_values, return nothing end -function calc_interface_flux!(backend, surface_flux_values, +function calc_interface_flux!(backend::Nothing, surface_flux_values, mesh::TreeMesh{3}, have_nonconservative_terms::True, equations, surface_integral, dg::DG, cache) diff --git a/src/solvers/dgsem_unstructured/dg_2d.jl b/src/solvers/dgsem_unstructured/dg_2d.jl index b5367b45d72..91152903540 100644 --- a/src/solvers/dgsem_unstructured/dg_2d.jl +++ b/src/solvers/dgsem_unstructured/dg_2d.jl @@ -80,7 +80,8 @@ function rhs!(backend, du, u, t, # Apply Jacobian from mapping to reference element # Note! this routine is reused from dgsem_structured/dg_2d.jl - @trixi_timeit timer() "Jacobian" apply_jacobian!(du, mesh, equations, dg, cache) + @trixi_timeit timer() "Jacobian" apply_jacobian!(backend, du, mesh, equations, dg, + cache) # Calculate source terms @trixi_timeit timer() "source terms" begin diff --git a/src/solvers/fdsbp_tree/fdsbp_2d.jl b/src/solvers/fdsbp_tree/fdsbp_2d.jl index 132b5161e78..2b08cfe7f11 100644 --- a/src/solvers/fdsbp_tree/fdsbp_2d.jl +++ b/src/solvers/fdsbp_tree/fdsbp_2d.jl @@ -214,7 +214,7 @@ end # already separates the solution information into right-traveling and # left-traveling information. So we only need to compute the appropriate # flux information at each side of an interface. -function calc_interface_flux!(surface_flux_values, +function calc_interface_flux!(backend::Nothing, surface_flux_values, mesh::TreeMesh{2}, have_nonconservative_terms::False, equations, surface_integral::SurfaceIntegralUpwind, diff --git a/src/solvers/fdsbp_tree/fdsbp_3d.jl b/src/solvers/fdsbp_tree/fdsbp_3d.jl index 9fe7cd3044d..86d82fe752e 100644 --- a/src/solvers/fdsbp_tree/fdsbp_3d.jl +++ b/src/solvers/fdsbp_tree/fdsbp_3d.jl @@ -250,7 +250,7 @@ end # already separates the solution information into right-traveling and # left-traveling information. 
So we only need to compute the appropriate # flux information at each side of an interface. -function calc_interface_flux!(surface_flux_values, +function calc_interface_flux!(backend::Nothing, surface_flux_values, mesh::TreeMesh{3}, have_nonconservative_terms::False, equations, surface_integral::SurfaceIntegralUpwind, From 5893d4dca815131d5e26aafaac318d9c3ea87c68 Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Wed, 8 Oct 2025 15:31:53 +0200 Subject: [PATCH 72/81] almost everywhere --- src/callbacks_step/stepsize_dg2d.jl | 12 ++++++------ src/callbacks_step/stepsize_dg3d.jl | 8 ++++---- src/solvers/dgmulti/dg.jl | 2 +- src/solvers/dgmulti/flux_differencing.jl | 4 ++-- src/solvers/dgmulti/flux_differencing_gauss_sbp.jl | 2 +- src/solvers/dgsem_structured/dg.jl | 2 +- src/solvers/dgsem_tree/dg_2d_parabolic.jl | 2 +- 7 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/callbacks_step/stepsize_dg2d.jl b/src/callbacks_step/stepsize_dg2d.jl index a6c217f2885..2691511c747 100644 --- a/src/callbacks_step/stepsize_dg2d.jl +++ b/src/callbacks_step/stepsize_dg2d.jl @@ -48,7 +48,7 @@ function max_dt(backend::Nothing, u, t, mesh::TreeMesh{2}, return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(backend, u, t, mesh::ParallelTreeMesh{2}, +function max_dt(backend::Nothing, u, t, mesh::ParallelTreeMesh{2}, constant_speed::False, equations, dg::DG, cache) # call the method accepting a general `mesh::TreeMesh{2}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` @@ -65,7 +65,7 @@ function max_dt(backend, u, t, mesh::ParallelTreeMesh{2}, return dt end -function max_dt(backend, u, t, mesh::ParallelTreeMesh{2}, +function max_dt(backend::Nothing, u, t, mesh::ParallelTreeMesh{2}, constant_speed::True, equations, dg::DG, cache) # call the method accepting a general `mesh::TreeMesh{2}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` @@ -154,7 +154,7 @@ function max_dt(backend::Nothing, u, t, return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(backend, u, t, mesh::ParallelP4estMesh{2}, +function max_dt(backend::Nothing, u, t, mesh::ParallelP4estMesh{2}, constant_speed::False, equations, dg::DG, cache) # call the method accepting a general `mesh::P4estMesh{2}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` @@ -171,7 +171,7 @@ function max_dt(backend, u, t, mesh::ParallelP4estMesh{2}, return dt end -function max_dt(backend, u, t, mesh::ParallelP4estMesh{2}, +function max_dt(backend::Nothing, u, t, mesh::ParallelP4estMesh{2}, constant_speed::True, equations, dg::DG, cache) # call the method accepting a general `mesh::P4estMesh{2}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` @@ -188,7 +188,7 @@ function max_dt(backend, u, t, mesh::ParallelP4estMesh{2}, return dt end -function max_dt(backend, u, t, mesh::ParallelT8codeMesh{2}, +function max_dt(backend::Nothing, u, t, mesh::ParallelT8codeMesh{2}, constant_speed::False, equations, dg::DG, cache) # call the method accepting a general `mesh::T8codeMesh{2}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` @@ -205,7 +205,7 @@ function max_dt(backend, u, t, mesh::ParallelT8codeMesh{2}, return dt end -function max_dt(backend, u, t, mesh::ParallelT8codeMesh{2}, +function max_dt(backend::Nothing, u, t, mesh::ParallelT8codeMesh{2}, constant_speed::True, equations, dg::DG, cache) # call the method accepting a general `mesh::T8codeMesh{2}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` diff --git 
a/src/callbacks_step/stepsize_dg3d.jl b/src/callbacks_step/stepsize_dg3d.jl index 1f67dfe7fc2..3f50d618fd1 100644 --- a/src/callbacks_step/stepsize_dg3d.jl +++ b/src/callbacks_step/stepsize_dg3d.jl @@ -173,7 +173,7 @@ function max_dt(backend, u, t, return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(backend, u, t, mesh::ParallelP4estMesh{3}, +function max_dt(backend::Nothing, u, t, mesh::ParallelP4estMesh{3}, constant_speed::False, equations, dg::DG, cache) # call the method accepting a general `mesh::P4estMesh{3}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` @@ -190,7 +190,7 @@ function max_dt(backend, u, t, mesh::ParallelP4estMesh{3}, return dt end -function max_dt(backend, u, t, mesh::ParallelP4estMesh{3}, +function max_dt(backend::Nothing, u, t, mesh::ParallelP4estMesh{3}, constant_speed::True, equations, dg::DG, cache) # call the method accepting a general `mesh::P4estMesh{3}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` @@ -207,7 +207,7 @@ function max_dt(backend, u, t, mesh::ParallelP4estMesh{3}, return dt end -function max_dt(backend, u, t, mesh::ParallelT8codeMesh{3}, +function max_dt(backend::Nothing, u, t, mesh::ParallelT8codeMesh{3}, constant_speed::False, equations, dg::DG, cache) # call the method accepting a general `mesh::T8codeMesh{3}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` @@ -224,7 +224,7 @@ function max_dt(backend, u, t, mesh::ParallelT8codeMesh{3}, return dt end -function max_dt(backend, u, t, mesh::ParallelT8codeMesh{3}, +function max_dt(backend::Nothing, u, t, mesh::ParallelT8codeMesh{3}, constant_speed::True, equations, dg::DG, cache) # call the method accepting a general `mesh::T8codeMesh{3}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` diff --git a/src/solvers/dgmulti/dg.jl b/src/solvers/dgmulti/dg.jl index 2be73e5e208..91279a461bd 100644 --- a/src/solvers/dgmulti/dg.jl +++ b/src/solvers/dgmulti/dg.jl @@ -662,7 +662,7 @@ function calc_sources!(du, u, t, source_terms, return nothing end -function rhs!(du, u, t, mesh, equations, +function rhs!(backend, du, u, t, mesh, equations, boundary_conditions::BC, source_terms::Source, dg::DGMulti, cache) where {BC, Source} @trixi_timeit timer() "reset ∂u/∂t" reset_du!(du, dg, cache) diff --git a/src/solvers/dgmulti/flux_differencing.jl b/src/solvers/dgmulti/flux_differencing.jl index 458e06e88b6..139c4d706c5 100644 --- a/src/solvers/dgmulti/flux_differencing.jl +++ b/src/solvers/dgmulti/flux_differencing.jl @@ -616,7 +616,7 @@ end # an entropy conservative/stable discretization. For modal DG schemes, an extra `entropy_projection!` # is required (see https://doi.org/10.1016/j.jcp.2018.02.033, Section 4.3). # Also called by DGMultiFluxDiff{<:GaussSBP} solvers. -function rhs!(du, u, t, mesh, equations, boundary_conditions::BC, +function rhs!(backend, du, u, t, mesh, equations, boundary_conditions::BC, source_terms::Source, dg::DGMultiFluxDiff, cache) where {Source, BC} @trixi_timeit timer() "reset ∂u/∂t" reset_du!(du, dg, cache) @@ -661,7 +661,7 @@ end # integral, e.g., an entropy conservative/stable discretization. The implementation of `rhs!` # for such schemes is very similar to the implementation of `rhs!` for standard DG methods, # but specializes `calc_volume_integral`. 
-function rhs!(du, u, t, mesh, equations, +function rhs!(backend, du, u, t, mesh, equations, boundary_conditions::BC, source_terms::Source, dg::DGMultiFluxDiffSBP, cache) where {BC, Source} @trixi_timeit timer() "reset ∂u/∂t" reset_du!(du, dg, cache) diff --git a/src/solvers/dgmulti/flux_differencing_gauss_sbp.jl b/src/solvers/dgmulti/flux_differencing_gauss_sbp.jl index cb06a40009a..f9d13334a11 100644 --- a/src/solvers/dgmulti/flux_differencing_gauss_sbp.jl +++ b/src/solvers/dgmulti/flux_differencing_gauss_sbp.jl @@ -582,7 +582,7 @@ end # Specialize RHS so that we can call `invert_jacobian_and_interpolate!` instead of just `invert_jacobian!`, # since `invert_jacobian!` is also used in other places (e.g., parabolic terms). -function rhs!(du, u, t, mesh, equations, boundary_conditions::BC, +function rhs!(backend, du, u, t, mesh, equations, boundary_conditions::BC, source_terms::Source, dg::DGMultiFluxDiff{<:GaussSBP}, cache) where {Source, BC} @trixi_timeit timer() "reset ∂u/∂t" reset_du!(du, dg, cache) diff --git a/src/solvers/dgsem_structured/dg.jl b/src/solvers/dgsem_structured/dg.jl index 931a5b81602..b661c2bbd02 100644 --- a/src/solvers/dgsem_structured/dg.jl +++ b/src/solvers/dgsem_structured/dg.jl @@ -44,7 +44,7 @@ function rhs!(backend, du, u, t, # Calculate volume integral @trixi_timeit timer() "volume integral" begin - calc_volume_integral!(du, u, mesh, + calc_volume_integral!(backend, du, u, mesh, have_nonconservative_terms(equations), equations, dg.volume_integral, dg, cache) end diff --git a/src/solvers/dgsem_tree/dg_2d_parabolic.jl b/src/solvers/dgsem_tree/dg_2d_parabolic.jl index 35f259ca9e5..ed2ba183454 100644 --- a/src/solvers/dgsem_tree/dg_2d_parabolic.jl +++ b/src/solvers/dgsem_tree/dg_2d_parabolic.jl @@ -835,7 +835,7 @@ function calc_gradient!(gradients, u_transformed, t, # Prolong solution to interfaces @trixi_timeit timer() "prolong2interfaces" begin - prolong2interfaces!(cache_parabolic, u_transformed, mesh, + prolong2interfaces!(nothing, cache_parabolic, u_transformed, mesh, equations_parabolic, dg) end From a1caa12dc35bfd4820ee225bef159ddf93db3966 Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Wed, 8 Oct 2025 17:19:48 +0200 Subject: [PATCH 73/81] some more --- src/callbacks_step/stepsize_dg2d.jl | 8 ++++---- src/solvers/dgsem_p4est/dg_2d_parabolic.jl | 2 +- src/solvers/dgsem_p4est/dg_3d_parabolic.jl | 2 +- src/solvers/dgsem_structured/dg.jl | 2 +- src/solvers/dgsem_tree/dg_1d.jl | 7 ++++--- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/callbacks_step/stepsize_dg2d.jl b/src/callbacks_step/stepsize_dg2d.jl index 2691511c747..a1b5eda6e30 100644 --- a/src/callbacks_step/stepsize_dg2d.jl +++ b/src/callbacks_step/stepsize_dg2d.jl @@ -195,10 +195,10 @@ function max_dt(backend::Nothing, u, t, mesh::ParallelT8codeMesh{2}, # and create some MPI array type, overloading broadcasting and mapreduce etc. # Then, this specific array type should also work well with DiffEq etc. 
dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), T8codeMesh{2}, + Tuple{typeof(backend), typeof(u), typeof(t), T8codeMesh{2}, typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + backend, u, t, mesh, constant_speed, equations, dg, cache) # Base.min instead of min needed, see comment in src/auxiliary/math.jl dt = MPI.Allreduce!(Ref(dt), Base.min, mpi_comm())[] @@ -212,10 +212,10 @@ function max_dt(backend::Nothing, u, t, mesh::ParallelT8codeMesh{2}, # and create some MPI array type, overloading broadcasting and mapreduce etc. # Then, this specific array type should also work well with DiffEq etc. dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), T8codeMesh{2}, + Tuple{typeof(backend), typeof(u), typeof(t), T8codeMesh{2}, typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + backend, u, t, mesh, constant_speed, equations, dg, cache) # Base.min instead of min needed, see comment in src/auxiliary/math.jl dt = MPI.Allreduce!(Ref(dt), Base.min, mpi_comm())[] diff --git a/src/solvers/dgsem_p4est/dg_2d_parabolic.jl b/src/solvers/dgsem_p4est/dg_2d_parabolic.jl index 7d263b5fa2e..4f43c041637 100644 --- a/src/solvers/dgsem_p4est/dg_2d_parabolic.jl +++ b/src/solvers/dgsem_p4est/dg_2d_parabolic.jl @@ -220,7 +220,7 @@ function calc_gradient!(gradients, u_transformed, t, # Prolong solution to interfaces. # This reuses `prolong2interfaces` for the purely hyperbolic case. @trixi_timeit timer() "prolong2interfaces" begin - prolong2interfaces!(cache_parabolic, u_transformed, mesh, + prolong2interfaces!(nothing, cache_parabolic, u_transformed, mesh, equations_parabolic, dg) end diff --git a/src/solvers/dgsem_p4est/dg_3d_parabolic.jl b/src/solvers/dgsem_p4est/dg_3d_parabolic.jl index 6703d3014de..ff0cff761cc 100644 --- a/src/solvers/dgsem_p4est/dg_3d_parabolic.jl +++ b/src/solvers/dgsem_p4est/dg_3d_parabolic.jl @@ -114,7 +114,7 @@ function calc_gradient!(gradients, u_transformed, t, # Prolong solution to interfaces @trixi_timeit timer() "prolong2interfaces" begin - prolong2interfaces!(cache_parabolic, u_transformed, mesh, + prolong2interfaces!(nothing, cache_parabolic, u_transformed, mesh, equations_parabolic, dg) end diff --git a/src/solvers/dgsem_structured/dg.jl b/src/solvers/dgsem_structured/dg.jl index b661c2bbd02..8828c32666f 100644 --- a/src/solvers/dgsem_structured/dg.jl +++ b/src/solvers/dgsem_structured/dg.jl @@ -64,7 +64,7 @@ function rhs!(backend, du, u, t, # Calculate surface integrals @trixi_timeit timer() "surface integral" begin - calc_surface_integral!(du, u, mesh, equations, + calc_surface_integral!(backend, du, u, mesh, equations, dg.surface_integral, dg, cache) end diff --git a/src/solvers/dgsem_tree/dg_1d.jl b/src/solvers/dgsem_tree/dg_1d.jl index 30cdd500646..57ecf8efc9c 100644 --- a/src/solvers/dgsem_tree/dg_1d.jl +++ b/src/solvers/dgsem_tree/dg_1d.jl @@ -103,7 +103,7 @@ function rhs!(backend, du, u, t, # Calculate surface integrals @trixi_timeit timer() "surface integral" begin - calc_surface_integral!(du, u, mesh, equations, + calc_surface_integral!(backend, du, u, mesh, equations, dg.surface_integral, dg, cache) end @@ -613,7 +613,8 @@ function calc_boundary_flux_by_direction!(surface_flux_values::AbstractArray{<:A return nothing end -function calc_surface_integral!(du, u, mesh::Union{TreeMesh{1}, StructuredMesh{1}}, +function calc_surface_integral!(backend::Nothing, du, u, + mesh::Union{TreeMesh{1}, StructuredMesh{1}}, equations, 
surface_integral, dg::DGSEM, cache) @unpack boundary_interpolation = dg.basis @unpack surface_flux_values = cache.elements @@ -639,7 +640,7 @@ function calc_surface_integral!(du, u, mesh::Union{TreeMesh{1}, StructuredMesh{1 return nothing end -function apply_jacobian!(du, mesh::Union{TreeMesh{1}, StructuredMesh{1}}, +function apply_jacobian!(backend::Nothing, du, mesh::Union{TreeMesh{1}, StructuredMesh{1}}, equations, dg::DG, cache) @unpack inverse_jacobian = cache.elements From a5cded3ae7cf1e94d0a19097d70b9ef2b16a2d55 Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Wed, 8 Oct 2025 21:35:15 +0200 Subject: [PATCH 74/81] next round --- src/solvers/dgsem_p4est/dg_2d_parabolic.jl | 4 ++-- src/solvers/dgsem_p4est/dg_3d_parabolic.jl | 2 +- src/solvers/dgsem_tree/dg_1d.jl | 2 +- src/solvers/dgsem_tree/dg_1d_parabolic.jl | 2 +- src/solvers/fdsbp_tree/fdsbp_1d.jl | 8 ++++---- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/solvers/dgsem_p4est/dg_2d_parabolic.jl b/src/solvers/dgsem_p4est/dg_2d_parabolic.jl index 4f43c041637..2ecd0025ef8 100644 --- a/src/solvers/dgsem_p4est/dg_2d_parabolic.jl +++ b/src/solvers/dgsem_p4est/dg_2d_parabolic.jl @@ -138,7 +138,7 @@ function rhs_parabolic!(du, u, t, mesh::Union{P4estMesh{2}, P4estMesh{3}}, # Calculate surface integrals @trixi_timeit timer() "surface integral" begin - calc_surface_integral!(du, u, mesh, equations_parabolic, + calc_surface_integral!(nothing, du, u, mesh, equations_parabolic, dg.surface_integral, dg, cache_parabolic) end @@ -227,7 +227,7 @@ function calc_gradient!(gradients, u_transformed, t, # Calculate interface fluxes for the gradient. # This reuses `calc_interface_flux!` for the purely hyperbolic case. @trixi_timeit timer() "interface flux" begin - calc_interface_flux!(cache_parabolic.elements.surface_flux_values, + calc_interface_flux!(nothing, cache_parabolic.elements.surface_flux_values, mesh, False(), # False() = no nonconservative terms equations_parabolic, dg.surface_integral, dg, cache_parabolic) diff --git a/src/solvers/dgsem_p4est/dg_3d_parabolic.jl b/src/solvers/dgsem_p4est/dg_3d_parabolic.jl index ff0cff761cc..34bfe1fa908 100644 --- a/src/solvers/dgsem_p4est/dg_3d_parabolic.jl +++ b/src/solvers/dgsem_p4est/dg_3d_parabolic.jl @@ -121,7 +121,7 @@ function calc_gradient!(gradients, u_transformed, t, # Calculate interface fluxes for the gradient. This reuses P4est `calc_interface_flux!` along with a # specialization for AbstractEquationsParabolic. 
@trixi_timeit timer() "interface flux" begin - calc_interface_flux!(cache_parabolic.elements.surface_flux_values, + calc_interface_flux!(nothing, cache_parabolic.elements.surface_flux_values, mesh, False(), # False() = no nonconservative terms equations_parabolic, dg.surface_integral, dg, cache_parabolic) diff --git a/src/solvers/dgsem_tree/dg_1d.jl b/src/solvers/dgsem_tree/dg_1d.jl index 57ecf8efc9c..7c5878b0dc1 100644 --- a/src/solvers/dgsem_tree/dg_1d.jl +++ b/src/solvers/dgsem_tree/dg_1d.jl @@ -108,7 +108,7 @@ function rhs!(backend, du, u, t, end # Apply Jacobian from mapping to reference element - @trixi_timeit timer() "Jacobian" apply_jacobian!(du, mesh, equations, dg, cache) + @trixi_timeit timer() "Jacobian" apply_jacobian!(backend, du, mesh, equations, dg, cache) # Calculate source terms @trixi_timeit timer() "source terms" begin diff --git a/src/solvers/dgsem_tree/dg_1d_parabolic.jl b/src/solvers/dgsem_tree/dg_1d_parabolic.jl index 06a6a4488ec..faa9a7240a4 100644 --- a/src/solvers/dgsem_tree/dg_1d_parabolic.jl +++ b/src/solvers/dgsem_tree/dg_1d_parabolic.jl @@ -90,7 +90,7 @@ function rhs_parabolic!(du, u, t, mesh::TreeMesh{1}, # Calculate surface integrals @trixi_timeit timer() "surface integral" begin - calc_surface_integral!(du, u, mesh, equations_parabolic, + calc_surface_integral!(nothing, du, u, mesh, equations_parabolic, dg.surface_integral, dg, cache_parabolic) end diff --git a/src/solvers/fdsbp_tree/fdsbp_1d.jl b/src/solvers/fdsbp_tree/fdsbp_1d.jl index 6e71d7627d9..ceebd104f43 100644 --- a/src/solvers/fdsbp_tree/fdsbp_1d.jl +++ b/src/solvers/fdsbp_tree/fdsbp_1d.jl @@ -139,7 +139,7 @@ function calc_volume_integral!(backend::Nothing, du, u, return nothing end -function calc_surface_integral!(du, u, mesh::TreeMesh{1}, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh{1}, equations, surface_integral::SurfaceIntegralStrongForm, dg::DG, cache) inv_weight_left = inv(left_boundary_weight(dg.basis)) @@ -166,7 +166,7 @@ function calc_surface_integral!(du, u, mesh::TreeMesh{1}, end # Periodic FDSBP operators need to use a single element without boundaries -function calc_surface_integral!(du, u, mesh::TreeMesh1D, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh1D, equations, surface_integral::SurfaceIntegralStrongForm, dg::PeriodicFDSBP, cache) @assert nelements(dg, cache) == 1 @@ -220,7 +220,7 @@ end # in the specialized `calc_interface_flux` routine. These SATs are still of # a strong form penalty type, except that the interior flux at a particular # side of the element are computed in the upwind direction. -function calc_surface_integral!(du, u, mesh::TreeMesh{1}, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh{1}, equations, surface_integral::SurfaceIntegralUpwind, dg::FDSBP, cache) inv_weight_left = inv(left_boundary_weight(dg.basis)) @@ -248,7 +248,7 @@ function calc_surface_integral!(du, u, mesh::TreeMesh{1}, end # Periodic FDSBP operators need to use a single element without boundaries -function calc_surface_integral!(du, u, mesh::TreeMesh1D, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh1D, equations, surface_integral::SurfaceIntegralUpwind, dg::PeriodicFDSBP, cache) @assert nelements(dg, cache) == 1 From 7c6ab4a571b2d0b7ac72a7cb2dac6ec8c64104b3 Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Thu, 9 Oct 2025 08:53:43 +0200 Subject: [PATCH 75/81] could this be... 
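
The parabolic-gradient specialization of `calc_interface_flux!` for
`P4estMesh{3}` now receives the mesh and the solver as types together with
the raw interface array `u_interface` instead of unpacking `cache`: the full
`cache` is not a suitable GPU kernel argument, while the adapted
`cache.interfaces.u` is. As a sketch of the intended call pattern (the index
names `i`, `j`, `i2`, `j2` below are placeholders; the actual caller is
adapted separately):

    # hypothetical call site: dispatch on Type{...} instead of instances
    calc_interface_flux!(surface_flux_values, typeof(mesh), False(),
                         equations_parabolic, dg.surface_integral, typeof(dg),
                         cache_parabolic.interfaces.u, interface,
                         normal_direction, i, j, primary_direction,
                         primary_element, i2, j2, secondary_direction,
                         secondary_element)

Also reflows two overlong signatures in dg_1d.jl.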
--- src/solvers/dgsem_p4est/dg_3d_parabolic.jl | 11 ++++++----- src/solvers/dgsem_tree/dg_1d.jl | 6 ++++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/solvers/dgsem_p4est/dg_3d_parabolic.jl b/src/solvers/dgsem_p4est/dg_3d_parabolic.jl index 34bfe1fa908..8d7049d37e6 100644 --- a/src/solvers/dgsem_p4est/dg_3d_parabolic.jl +++ b/src/solvers/dgsem_p4est/dg_3d_parabolic.jl @@ -352,21 +352,22 @@ end end # This version is used for parabolic gradient computations -@inline function calc_interface_flux!(surface_flux_values, mesh::P4estMesh{3}, +@inline function calc_interface_flux!(surface_flux_values, + ::Type{<:Union{P4estMesh{3}}}, have_nonconservative_terms::False, equations::AbstractEquationsParabolic, - surface_integral, dg::DG, cache, + surface_integral, solverT::Type{<:DG}, + u_interface, interface_index, normal_direction, primary_i_node_index, primary_j_node_index, primary_direction_index, primary_element_index, secondary_i_node_index, secondary_j_node_index, secondary_direction_index, secondary_element_index) - @unpack u = cache.interfaces @unpack surface_flux = surface_integral - u_ll, u_rr = get_surface_node_vars(u, equations, dg, primary_i_node_index, - primary_j_node_index, + u_ll, u_rr = get_surface_node_vars(u_interface, equations, solverT, + primary_i_node_index, primary_j_node_index, interface_index) flux_ = 0.5f0 * (u_ll + u_rr) # we assume that the gradient computations utilize a central flux diff --git a/src/solvers/dgsem_tree/dg_1d.jl b/src/solvers/dgsem_tree/dg_1d.jl index 7c5878b0dc1..f594ea7eb08 100644 --- a/src/solvers/dgsem_tree/dg_1d.jl +++ b/src/solvers/dgsem_tree/dg_1d.jl @@ -108,7 +108,8 @@ function rhs!(backend, du, u, t, end # Apply Jacobian from mapping to reference element - @trixi_timeit timer() "Jacobian" apply_jacobian!(backend, du, mesh, equations, dg, cache) + @trixi_timeit timer() "Jacobian" apply_jacobian!(backend, du, mesh, equations, dg, + cache) # Calculate source terms @trixi_timeit timer() "source terms" begin @@ -640,7 +641,8 @@ function calc_surface_integral!(backend::Nothing, du, u, return nothing end -function apply_jacobian!(backend::Nothing, du, mesh::Union{TreeMesh{1}, StructuredMesh{1}}, +function apply_jacobian!(backend::Nothing, du, + mesh::Union{TreeMesh{1}, StructuredMesh{1}}, equations, dg::DG, cache) @unpack inverse_jacobian = cache.elements From 719c2d15cfb7f250b6af8e031d3cc6d7377a54b2 Mon Sep 17 00:00:00 2001 From: Vivienne Ehlert <201612348+vivimie@users.noreply.github.com> Date: Thu, 6 Nov 2025 16:01:55 +0100 Subject: [PATCH 76/81] adapts until 2d prolong2interfaces! --- src/solvers/dgsem_p4est/dg_2d.jl | 111 +++++++++++++++++--------- src/solvers/dgsem_structured/dg_2d.jl | 62 ++++++++++++-- 2 files changed, 127 insertions(+), 46 deletions(-) diff --git a/src/solvers/dgsem_p4est/dg_2d.jl b/src/solvers/dgsem_p4est/dg_2d.jl index 56b6568072d..b1c5d932b3e 100644 --- a/src/solvers/dgsem_p4est/dg_2d.jl +++ b/src/solvers/dgsem_p4est/dg_2d.jl @@ -64,56 +64,91 @@ end end function prolong2interfaces!(backend::Nothing, cache, u, - mesh::Union{P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}}, + mesh::Union{P4estMesh{2}, P4estMeshView{2}, + T8codeMesh{2}}, equations, dg::DG) @unpack interfaces = cache + @unpack neighbor_ids, node_indices = cache.interfaces index_range = eachnode(dg) @threaded for interface in eachinterface(dg, cache) - # Copy solution data from the primary element using "delayed indexing" with - # a start value and a step size to get the correct face and orientation. 
-        # Note that in the current implementation, the interface will be
-        # "aligned at the primary element", i.e., the index of the primary side
-        # will always run forwards.
-        primary_element = interfaces.neighbor_ids[1, interface]
-        primary_indices = interfaces.node_indices[1, interface]
+        prolong2interfaces_per_interface!(interfaces.u, u, interface, typeof(mesh),
+                                          equations, neighbor_ids, node_indices,
+                                          index_range)
+    end
+    return nothing
+end

-        i_primary_start, i_primary_step = index_to_start_step_2d(primary_indices[1],
-                                                                 index_range)
-        j_primary_start, j_primary_step = index_to_start_step_2d(primary_indices[2],
-                                                                 index_range)
+function prolong2interfaces!(backend::Backend, cache, u,
+                             mesh::Union{P4estMesh{2}, P4estMeshView{2},
+                                         T8codeMesh{2}},
+                             equations, dg::DG)
+    @unpack interfaces = cache
+    ninterfaces(interfaces) == 0 && return nothing
+    @unpack neighbor_ids, node_indices = cache.interfaces
+    index_range = eachnode(dg)

-        i_primary = i_primary_start
-        j_primary = j_primary_start
-        for i in eachnode(dg)
-            for v in eachvariable(equations)
-                interfaces.u[1, v, i, interface] = u[v, i_primary, j_primary,
-                                                     primary_element]
-            end
-            i_primary += i_primary_step
-            j_primary += j_primary_step
+    kernel! = prolong2interfaces_KAkernel!(backend)
+    kernel!(interfaces.u, u, typeof(mesh), equations, neighbor_ids, node_indices,
+            index_range, ndrange = ninterfaces(interfaces))
+    return nothing
+end
+
+@kernel function prolong2interfaces_KAkernel!(interfaces_u, u,
+                                              mT::Type{<:Union{P4estMesh{2},
+                                                               P4estMeshView{2},
+                                                               T8codeMesh{2}}},
+                                              equations, neighbor_ids,
+                                              node_indices, index_range)
+    interface = @index(Global)
+    prolong2interfaces_per_interface!(interfaces_u, u, interface, mT, equations,
+                                      neighbor_ids, node_indices, index_range)
+end
+
+function prolong2interfaces_per_interface!(interfaces_u, u, interface,
+                                           ::Type{<:Union{P4estMesh{2},
+                                                          P4estMeshView{2},
+                                                          T8codeMesh{2}}},
+                                           equations, neighbor_ids, node_indices,
+                                           index_range)
+    primary_element = neighbor_ids[1, interface]
+    primary_indices = node_indices[1, interface]
+
+    i_primary_start, i_primary_step = index_to_start_step_2d(primary_indices[1],
+                                                             index_range)
+    j_primary_start, j_primary_step = index_to_start_step_2d(primary_indices[2],
+                                                             index_range)
+
+    i_primary = i_primary_start
+    j_primary = j_primary_start
+    for i in index_range
+        for v in eachvariable(equations)
+            interfaces_u[1, v, i, interface] = u[v, i_primary, j_primary,
+                                                 primary_element]
        end
+        i_primary += i_primary_step
+        j_primary += j_primary_step
+    end

-        # Copy solution data from the secondary element using "delayed indexing" with
-        # a start value and a step size to get the correct face and orientation.
-        secondary_element = interfaces.neighbor_ids[2, interface]
-        secondary_indices = interfaces.node_indices[2, interface]
+    # Copy solution data from the secondary element using "delayed indexing" with
+    # a start value and a step size to get the correct face and orientation.
+ secondary_element = neighbor_ids[2, interface] + secondary_indices = node_indices[2, interface] - i_secondary_start, i_secondary_step = index_to_start_step_2d(secondary_indices[1], - index_range) - j_secondary_start, j_secondary_step = index_to_start_step_2d(secondary_indices[2], - index_range) + i_secondary_start, i_secondary_step = index_to_start_step_2d(secondary_indices[1], + index_range) + j_secondary_start, j_secondary_step = index_to_start_step_2d(secondary_indices[2], + index_range) - i_secondary = i_secondary_start - j_secondary = j_secondary_start - for i in eachnode(dg) - for v in eachvariable(equations) - interfaces.u[2, v, i, interface] = u[v, i_secondary, j_secondary, - secondary_element] - end - i_secondary += i_secondary_step - j_secondary += j_secondary_step + i_secondary = i_secondary_start + j_secondary = j_secondary_start + for i in index_range + for v in eachvariable(equations) + interfaces_u[2, v, i, interface] = u[v, i_secondary, j_secondary, + secondary_element] end + i_secondary += i_secondary_step + j_secondary += j_secondary_step end return nothing diff --git a/src/solvers/dgsem_structured/dg_2d.jl b/src/solvers/dgsem_structured/dg_2d.jl index 1883fa5f881..bfeaab65c7d 100644 --- a/src/solvers/dgsem_structured/dg_2d.jl +++ b/src/solvers/dgsem_structured/dg_2d.jl @@ -5,6 +5,50 @@ @muladd begin #! format: noindent +function calc_volume_integral!(::Nothing, du, u, + mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, + UnstructuredMesh2D, P4estMesh{2}, + P4estMeshView{2}, T8codeMesh{2}}, + have_nonconservative_terms, equations, + volume_integral::VolumeIntegralWeakForm, + dg::DGSEM, cache) + @unpack contravariant_vectors = cache.elements + @threaded for element in eachelement(dg, cache) + weak_form_kernel_per_element!(du, u, element, typeof(mesh), + have_nonconservative_terms, equations, dg, + contravariant_vectors) + end + return nothing +end + +function calc_volume_integral!(backend::Backend, du, u, + mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, + UnstructuredMesh2D, P4estMesh{2}, + P4estMeshView{2}, T8codeMesh{2}}, + have_nonconservative_terms, equations, + volume_integral::VolumeIntegralWeakForm, + dg::DGSEM, cache) + nelements(dg, cache) == 0 && return nothing + @unpack contravariant_vectors = cache.elements + kernel! = weak_form_KAkernel!(backend) + kernel!(du, u, typeof(mesh), have_nonconservative_terms, equations, dg, + contravariant_vectors, ndrange = nelements(dg, cache)) + return nothing +end + +@kernel function weak_form_KAkernel!(du, u, + mT::Type{<:Union{StructuredMesh{2}, + StructuredMeshView{2}, + UnstructuredMesh2D, + P4estMesh{2}, + P4estMeshView{2}, + T8codeMesh{2}}}, + have_nonconservative_terms, equations, + dg::DGSEM, contravariant_vectors) + element = @index(Global) + weak_form_kernel_per_element!(du, u, element, mT, have_nonconservative_terms, + equations, dg, contravariant_vectors) +end #= `weak_form_kernel!` is only implemented for conserved terms as non-conservative terms should always be discretized in conjunction with a flux-splitting scheme, @@ -12,17 +56,19 @@ see `flux_differencing_kernel!`. This treatment is required to achieve, e.g., entropy-stability or well-balancedness. 
See also https://github.com/trixi-framework/Trixi.jl/issues/1671#issuecomment-1765644064
=#
-@inline function weak_form_kernel!(du, u,
-                                   element,
-                                   mesh::Union{StructuredMesh{2}, StructuredMeshView{2},
-                                               UnstructuredMesh2D, P4estMesh{2},
-                                               P4estMeshView{2}, T8codeMesh{2}},
-                                   have_nonconservative_terms::False, equations,
-                                   dg::DGSEM, cache, alpha = true)
+@inline function weak_form_kernel_per_element!(du, u, element,
+                                               ::Type{<:Union{StructuredMesh{2},
+                                                              StructuredMeshView{2},
+                                                              UnstructuredMesh2D,
+                                                              P4estMesh{2},
+                                                              P4estMeshView{2},
+                                                              T8codeMesh{2}}},
+                                               have_nonconservative_terms::False,
+                                               equations, dg::DGSEM,
+                                               contravariant_vectors, alpha = true)
     # true * [some floating point value] == [exactly the same floating point value]
     # This can (hopefully) be optimized away due to constant propagation.
     @unpack derivative_dhat = dg.basis
-    @unpack contravariant_vectors = cache.elements

     for j in eachnode(dg), i in eachnode(dg)
         u_node = get_node_vars(u, equations, dg, i, j, element)

From 6bbc069a9503c0678df705543fc8497b1ba998a4 Mon Sep 17 00:00:00 2001
From: Vivienne Ehlert <201612348+vivimie@users.noreply.github.com>
Date: Thu, 6 Nov 2025 16:04:03 +0100
Subject: [PATCH 77/81] adds explicit mesh type in signature

---
 src/solvers/dgsem_structured/dg_3d.jl | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/solvers/dgsem_structured/dg_3d.jl b/src/solvers/dgsem_structured/dg_3d.jl
index cd39623a367..50772a4d1c2 100644
--- a/src/solvers/dgsem_structured/dg_3d.jl
+++ b/src/solvers/dgsem_structured/dg_3d.jl
@@ -36,7 +36,11 @@ function calc_volume_integral!(backend::Backend, du, u,
     return nothing
 end

-@kernel function weak_form_KAkernel!(du, u, meshT, have_nonconservative_terms,
+@kernel function weak_form_KAkernel!(du, u,
+                                     meshT::Type{<:Union{StructuredMesh{3},
+                                                         P4estMesh{3},
+                                                         T8codeMesh{3}}},
+                                     have_nonconservative_terms,
                                      equations, dg::DGSEM,
                                      contravariant_vectors)
     element = @index(Global)

From e58c2985ab1e99397e0138f57906cb9387e80ed8 Mon Sep 17 00:00:00 2001
From: Vivienne Ehlert <201612348+vivimie@users.noreply.github.com>
Date: Fri, 7 Nov 2025 16:38:15 +0100
Subject: [PATCH 78/81] adapts the rest for the 2d basic advection gpu elixir

---
 src/callbacks_step/stepsize_dg2d.jl   | 163 ++++++++++----
 src/solvers/dgsem_p4est/containers.jl |  12 ++
 src/solvers/dgsem_p4est/dg_2d.jl      | 299 +++++++++++++++++---------
 src/solvers/dgsem_structured/dg_2d.jl |  45 +++-
 4 files changed, 367 insertions(+), 152 deletions(-)

diff --git a/src/callbacks_step/stepsize_dg2d.jl b/src/callbacks_step/stepsize_dg2d.jl
index a1b5eda6e30..8c5560f3b9d 100644
--- a/src/callbacks_step/stepsize_dg2d.jl
+++ b/src/callbacks_step/stepsize_dg2d.jl
@@ -81,7 +81,6 @@ function max_dt(backend::Nothing, u, t, mesh::ParallelTreeMesh{2},
     return dt
 end

-
 function max_dt(backend::Nothing, u, t,
                 mesh::Union{StructuredMesh{2}, UnstructuredMesh2D, P4estMesh{2},
                             T8codeMesh{2}, StructuredMeshView{2}},
@@ -89,69 +88,145 @@ function max_dt(backend::Nothing, u, t,
     # to avoid a division by zero if the speed vanishes everywhere,
     # e.g. for steady-state linear advection
     max_scaled_speed = nextfloat(zero(t))
+    @unpack contravariant_vectors, inverse_jacobian = cache.elements
+    @batch reduction=(max, max_scaled_speed) for element in eachelement(dg, cache)
+        max_lambda = max_scaled_speed_per_element(u, typeof(mesh), constant_speed,
+                                                  equations, dg,
+                                                  contravariant_vectors,
+                                                  inverse_jacobian, element)
+        # Use `Base.max` to prevent silent failures, as `max` from `@fastmath` doesn't propagate
+        # `NaN`s properly. 
See https://github.com/trixi-framework/Trixi.jl/pull/2445#discussion_r2336812323
+        max_scaled_speed = Base.max(max_scaled_speed, max_lambda)
+    end
+    return 2 / (nnodes(dg) * max_scaled_speed)
+end
+
+function max_dt(backend::Backend, u, t,
+                mesh::Union{StructuredMesh{2}, UnstructuredMesh2D, P4estMesh{2},
+                            T8codeMesh{2}, StructuredMeshView{2}},
+                constant_speed::False, equations, dg::DG, cache)
     @unpack contravariant_vectors, inverse_jacobian = cache.elements
+    num_elements = nelements(dg, cache)
+    max_scaled_speeds = allocate(backend, eltype(t), num_elements)
+
+    kernel! = max_scaled_speed_KAkernel!(backend)
+    kernel!(max_scaled_speeds, u, typeof(mesh), constant_speed, equations, dg,
+            contravariant_vectors, inverse_jacobian, ndrange = num_elements)
+    # TODO GPU dt on CPU? (time integration happens on CPU)
+    max_scaled_speed = max(nextfloat(zero(t)), maximum(max_scaled_speeds))
+    return 2 / (nnodes(dg) * max_scaled_speed)
+end

-    @batch reduction=(max, max_scaled_speed) for element in eachelement(dg, cache)
-        max_lambda1 = max_lambda2 = zero(max_scaled_speed)
-        for j in eachnode(dg), i in eachnode(dg)
-            u_node = get_node_vars(u, equations, dg, i, j, element)
-            lambda1, lambda2 = max_abs_speeds(u_node, equations)
-
-            # Local speeds transformed to the reference element
-            Ja11, Ja12 = get_contravariant_vector(1, contravariant_vectors,
-                                                  i, j, element)
-            lambda1_transformed = abs(Ja11 * lambda1 + Ja12 * lambda2)
-            Ja21, Ja22 = get_contravariant_vector(2, contravariant_vectors,
-                                                  i, j, element)
-            lambda2_transformed = abs(Ja21 * lambda1 + Ja22 * lambda2)
+# works for both constant and non-constant speed
+@kernel function max_scaled_speed_KAkernel!(max_scaled_speeds, u,
+                                            mT::Type{<:Union{StructuredMesh{2},
+                                                             UnstructuredMesh2D,
+                                                             P4estMesh{2},
+                                                             T8codeMesh{2},
+                                                             StructuredMeshView{2}}},
+                                            constant_speed, equations,
+                                            dg::DG, contravariant_vectors,
+                                            inverse_jacobian)
+    element = @index(Global)
+    max_scaled_speeds[element] = max_scaled_speed_per_element(u, mT, constant_speed,
+                                                              equations, dg,
+                                                              contravariant_vectors,
+                                                              inverse_jacobian,
+                                                              element)
+end

-            inv_jacobian = abs(inverse_jacobian[i, j, element])
+function max_scaled_speed_per_element(u,
+                                      mT::Type{<:Union{StructuredMesh{2},
+                                                       UnstructuredMesh2D,
+                                                       P4estMesh{2}, T8codeMesh{2},
+                                                       StructuredMeshView{2}}},
+                                      constant_speed::False, equations, dg::DG,
+                                      contravariant_vectors, inverse_jacobian,
+                                      element)
+    max_lambda1 = max_lambda2 = zero(eltype(u))
+    for j in eachnode(dg), i in eachnode(dg)
+        u_node = get_node_vars(u, equations, dg, i, j, element)
+        lambda1, lambda2 = max_abs_speeds(u_node, equations)
+
+        # Local speeds transformed to the reference element
+        Ja11, Ja12 = get_contravariant_vector(1, contravariant_vectors,
+                                              i, j, element)
+        lambda1_transformed = abs(Ja11 * lambda1 + Ja12 * lambda2)
+        Ja21, Ja22 = get_contravariant_vector(2, contravariant_vectors,
+                                              i, j, element)
+        lambda2_transformed = abs(Ja21 * lambda1 + Ja22 * lambda2)
+
+        inv_jacobian = abs(inverse_jacobian[i, j, element])
+
+        max_lambda1 = Base.max(max_lambda1, lambda1_transformed * inv_jacobian)
+        max_lambda2 = Base.max(max_lambda2, lambda2_transformed * inv_jacobian)
+    end
+    return max_lambda1 + max_lambda2
+end

-            max_lambda1 = Base.max(max_lambda1, lambda1_transformed * inv_jacobian)
-            max_lambda2 = Base.max(max_lambda2, lambda2_transformed * inv_jacobian)
-        end
+function max_dt(backend::Nothing, u, t,
+                mesh::Union{StructuredMesh{2}, UnstructuredMesh2D, P4estMesh{2},
+                            P4estMeshView{2}, T8codeMesh{2}, StructuredMeshView{2}},
+                constant_speed::True, equations, 
dg::DG, cache)
     max_scaled_speed = nextfloat(zero(t))
     @unpack contravariant_vectors, inverse_jacobian = cache.elements
+    @batch reduction=(max, max_scaled_speed) for element in eachelement(dg, cache)
+        max_lambda = max_scaled_speed_per_element(u, typeof(mesh), constant_speed,
+                                                  equations, dg, contravariant_vectors,
+                                                  inverse_jacobian, element)
+        # Use `Base.max` to prevent silent failures, as `max` from `@fastmath` doesn't propagate
+        # `NaN`s properly. See https://github.com/trixi-framework/Trixi.jl/pull/2445#discussion_r2336812323
+        max_scaled_speed = Base.max(max_scaled_speed, max_lambda)
+    end
+    return 2 / (nnodes(dg) * max_scaled_speed)
+end

+function max_dt(backend::Backend, u, t,
+                mesh::Union{StructuredMesh{2}, UnstructuredMesh2D, P4estMesh{2},
+                            P4estMeshView{2}, T8codeMesh{2}, StructuredMeshView{2}},
+                constant_speed::True, equations, dg::DG, cache)
     @unpack contravariant_vectors, inverse_jacobian = cache.elements
+    num_elements = nelements(dg, cache)
+    max_scaled_speeds = allocate(backend, eltype(t), num_elements)
+
+    kernel! = max_scaled_speed_KAkernel!(backend)
+    kernel!(max_scaled_speeds, u, typeof(mesh), constant_speed, equations, dg,
+            contravariant_vectors, inverse_jacobian, ndrange = num_elements)
+    # TODO GPU dt on CPU? (time integration happens on CPU)
+    max_scaled_speed = max(nextfloat(zero(t)), maximum(max_scaled_speeds))
+    return 2 / (nnodes(dg) * max_scaled_speed)
+end

-    # to avoid a division by zero if the speed vanishes everywhere,
-    # e.g. for steady-state linear advection
-    max_scaled_speed = nextfloat(zero(t))
-
+function max_scaled_speed_per_element(u,
+                                      ::Type{<:Union{StructuredMesh{2},
+                                                     UnstructuredMesh2D,
+                                                     P4estMesh{2},
+                                                     P4estMeshView{2},
+                                                     T8codeMesh{2},
+                                                     StructuredMeshView{2}}},
+                                      constant_speed::True, equations, dg::DG,
+                                      contravariant_vectors, inverse_jacobian,
+                                      element)
+
+    max_lambda1_loc = max_lambda2_loc = nextfloat(zero(eltype(u)))
     max_lambda1, max_lambda2 = max_abs_speeds(equations)
+    for j in eachnode(dg), i in eachnode(dg)
+        # Local speeds transformed to the reference element
+        Ja11, Ja12 = get_contravariant_vector(1, contravariant_vectors,
+                                              i, j, element)
+        lambda1_transformed = abs(Ja11 * max_lambda1 + Ja12 * max_lambda2)
+        Ja21, Ja22 = get_contravariant_vector(2, contravariant_vectors,
+                                              i, j, element)
+        lambda2_transformed = abs(Ja21 * max_lambda1 + Ja22 * max_lambda2)
+
+        inv_jacobian = abs(inverse_jacobian[i, j, element])
+
-    @batch reduction=(max, max_scaled_speed) for element in eachelement(dg, cache)
-        for j in eachnode(dg), i in eachnode(dg)
-            # Local speeds transformed to the reference element
-            Ja11, Ja12 = get_contravariant_vector(1, contravariant_vectors,
-                                                  i, j, element)
-            lambda1_transformed = abs(Ja11 * max_lambda1 + Ja12 * max_lambda2)
-            Ja21, Ja22 = get_contravariant_vector(2, contravariant_vectors,
-                                                  i, j, element)
-            lambda2_transformed = abs(Ja21 * max_lambda1 + Ja22 * max_lambda2)
-
-            inv_jacobian = abs(inverse_jacobian[i, j, element])
-            # Use `Base.max` to prevent silent failures, as `max` from `@fastmath` doesn't propagate
-            # `NaN`s properly. 
See https://github.com/trixi-framework/Trixi.jl/pull/2445#discussion_r2336812323
-            max_scaled_speed = Base.max(max_scaled_speed,
-                                        inv_jacobian *
-                                        (lambda1_transformed + lambda2_transformed))
-        end
+        max_lambda1_loc = max(max_lambda1_loc, inv_jacobian * lambda1_transformed)
+        max_lambda2_loc = max(max_lambda2_loc, inv_jacobian * lambda2_transformed)
     end
-
-    return 2 / (nnodes(dg) * max_scaled_speed)
+
+    return max_lambda1_loc + max_lambda2_loc
 end

 function max_dt(backend::Nothing, u, t, mesh::ParallelP4estMesh{2},
diff --git a/src/solvers/dgsem_p4est/containers.jl b/src/solvers/dgsem_p4est/containers.jl
index 3f74f699f19..836805bbf86 100644
--- a/src/solvers/dgsem_p4est/containers.jl
+++ b/src/solvers/dgsem_p4est/containers.jl
@@ -933,6 +933,18 @@ end
     end
 end

+@inline function indices2direction2d(indices)
+    if indices[1] === :begin
+        return 1
+    elseif indices[1] === :end
+        return 2
+    elseif indices[2] === :begin
+        return 3
+    else # if indices[2] === :end
+        return 4
+    end
+end
+
 include("containers_2d.jl")
 include("containers_3d.jl")
 include("containers_parallel.jl")
diff --git a/src/solvers/dgsem_p4est/dg_2d.jl b/src/solvers/dgsem_p4est/dg_2d.jl
index b1c5d932b3e..3b587df1fc4 100644
--- a/src/solvers/dgsem_p4est/dg_2d.jl
+++ b/src/solvers/dgsem_p4est/dg_2d.jl
@@ -159,84 +159,145 @@ function calc_interface_flux!(backend::Nothing, surface_flux_values,
                               T8codeMesh{2}},
                               have_nonconservative_terms,
                               equations, surface_integral, dg::DG, cache)
+    @unpack neighbor_ids, node_indices = cache.interfaces
     @unpack contravariant_vectors = cache.elements
     index_range = eachnode(dg)
-    index_end = last(index_range)

     @threaded for interface in eachinterface(dg, cache)
-        # Get element and side index information on the primary element
-        primary_element = neighbor_ids[1, interface]
-        primary_indices = node_indices[1, interface]
-        primary_direction = indices2direction(primary_indices)
+        calc_interface_flux_per_interface!(surface_flux_values, typeof(mesh),
+                                           have_nonconservative_terms,
+                                           equations, surface_integral, typeof(dg),
+                                           cache.interfaces.u, interface,
+                                           neighbor_ids, node_indices,
+                                           contravariant_vectors, index_range)
+    end

-        # Create the local i,j indexing on the primary element used to pull normal direction information
-        i_primary_start, i_primary_step = index_to_start_step_2d(primary_indices[1],
-                                                                 index_range)
-        j_primary_start, j_primary_step = index_to_start_step_2d(primary_indices[2],
-                                                                 index_range)
+    return nothing
+end

-        i_primary = i_primary_start
-        j_primary = j_primary_start
-
-        # Get element and side index information on the secondary element
-        secondary_element = neighbor_ids[2, interface]
-        secondary_indices = node_indices[2, interface]
-        secondary_direction = indices2direction(secondary_indices)
-
-        # Initiate the secondary index to be used in the surface for loop.
-        # This index on the primary side will always run forward but
-        # the secondary index might need to run backwards for flipped sides. 
-        if :i_backward in secondary_indices
-            node_secondary = index_end
-            node_secondary_step = -1
-        else
-            node_secondary = 1
-            node_secondary_step = 1
-        end

-        for node in eachnode(dg)
-            # Get the normal direction on the primary element.
-            # Contravariant vectors at interfaces in negative coordinate direction
-            # are pointing inwards. This is handled by `get_normal_direction`.
-            normal_direction = get_normal_direction(primary_direction,
-                                                    contravariant_vectors,
-                                                    i_primary, j_primary,
-                                                    primary_element)
-
-            calc_interface_flux!(surface_flux_values, mesh, have_nonconservative_terms,
-                                 equations,
-                                 surface_integral, dg, cache,
-                                 interface, normal_direction,
-                                 node, primary_direction, primary_element,
-                                 node_secondary, secondary_direction, secondary_element)
-
-            # Increment primary element indices to pull the normal direction
-            i_primary += i_primary_step
-            j_primary += j_primary_step
-            # Increment the surface node index along the secondary element
-            node_secondary += node_secondary_step
-        end
-    end

+function calc_interface_flux!(backend::Backend, surface_flux_values,
+                              mesh::Union{P4estMesh{2}, P4estMeshView{2},
+                                          T8codeMesh{2}},
+                              have_nonconservative_terms,
+                              equations, surface_integral, dg::DG, cache)

+    ninterfaces(cache.interfaces) == 0 && return nothing
+    @unpack neighbor_ids, node_indices = cache.interfaces
+    @unpack contravariant_vectors = cache.elements
+    index_range = eachnode(dg)
+
+    kernel! = calc_interface_flux_KAkernel!(backend)
+    kernel!(surface_flux_values, typeof(mesh), have_nonconservative_terms,
+            equations, surface_integral, typeof(dg), cache.interfaces.u,
+            neighbor_ids, node_indices, contravariant_vectors, index_range,
+            ndrange = ninterfaces(cache.interfaces))
+
+    return nothing
+end
+
+@kernel function calc_interface_flux_KAkernel!(surface_flux_values,
+                                               mt::Type{<:Union{P4estMesh{2},
+                                                                P4estMeshView{2},
+                                                                T8codeMesh{2}}},
+                                               have_nonconservative_terms,
+                                               equations, surface_integral,
+                                               st::Type{<:DG}, u_interface,
+                                               neighbor_ids, node_indices,
+                                               contravariant_vectors, index_range)
+    interface = @index(Global)
+    calc_interface_flux_per_interface!(surface_flux_values, mt,
+                                       have_nonconservative_terms, equations,
+                                       surface_integral, st, u_interface,
+                                       interface, neighbor_ids, node_indices,
+                                       contravariant_vectors, index_range)
+end
+
+function calc_interface_flux_per_interface!(surface_flux_values,
+                                            mt::Type{<:Union{P4estMesh{2},
+                                                             P4estMeshView{2},
+                                                             T8codeMesh{2}}},
+                                            have_nonconservative_terms,
+                                            equations, surface_integral, st::Type{<:DG},
+                                            u_interface, interface, neighbor_ids,
+                                            node_indices, contravariant_vectors,
+                                            index_range)
+    index_end = last(index_range)
+
+    # Get element and side index information on the primary element
+    primary_element = neighbor_ids[1, interface]
+    primary_indices = node_indices[1, interface]
+    primary_direction = indices2direction2d(primary_indices)
+
+    # Create the local i,j indexing on the primary element used to pull normal direction information
+    i_primary_start, i_primary_step = index_to_start_step_2d(primary_indices[1],
+                                                             index_range)
+    j_primary_start, j_primary_step = index_to_start_step_2d(primary_indices[2],
+                                                             index_range)
+
+    i_primary = i_primary_start
+    j_primary = j_primary_start
+
+    # Get element and side index information on the secondary element
+    secondary_element = neighbor_ids[2, interface]
+    secondary_indices = node_indices[2, interface]
+    secondary_direction = indices2direction2d(secondary_indices)
+
+    # Initiate the secondary index to be used in the surface for loop. 
+ # This index on the primary side will always run forward but + # the secondary index might need to run backwards for flipped sides. + if :i_backward in secondary_indices + node_secondary = index_end + node_secondary_step = -1 + else + node_secondary = 1 + node_secondary_step = 1 + end + + for node in index_range + # Get the normal direction on the primary element. + # Contravariant vectors at interfaces in negative coordinate direction + # are pointing inwards. This is handled by `get_normal_direction`. + normal_direction = get_normal_direction(primary_direction, + contravariant_vectors, + i_primary, j_primary, + primary_element) + + calc_interface_flux!(surface_flux_values, mt, have_nonconservative_terms, + equations, surface_integral, st, u_interface, interface, + normal_direction, node, primary_direction, + primary_element, node_secondary, + secondary_direction, secondary_element) + + # Increment primary element indices to pull the normal direction + i_primary += i_primary_step + j_primary += j_primary_step + # Increment the surface node index along the secondary element + node_secondary += node_secondary_step end + return nothing end # Inlined version of the interface flux computation for conservation laws @inline function calc_interface_flux!(surface_flux_values, - mesh::Union{P4estMesh{2}, P4estMeshView{2}, - T8codeMesh{2}}, + ::Type{<:Union{P4estMesh{2}, + P4estMeshView{2}, + T8codeMesh{2}}}, have_nonconservative_terms::False, equations, - surface_integral, dg::DG, cache, - interface_index, normal_direction, - primary_node_index, primary_direction_index, + surface_integral, st::Type{<:DG}, + u_interface, interface_index, + normal_direction, primary_node_index, + primary_direction_index, primary_element_index, - secondary_node_index, secondary_direction_index, + secondary_node_index, + secondary_direction_index, secondary_element_index) - @unpack u = cache.interfaces @unpack surface_flux = surface_integral - u_ll, u_rr = get_surface_node_vars(u, equations, dg, primary_node_index, + u_ll, u_rr = get_surface_node_vars(u_interface, equations, st, + primary_node_index, interface_index) flux_ = surface_flux(u_ll, u_rr, normal_direction, equations) @@ -251,18 +312,19 @@ end # Inlined version of the interface flux computation for equations with conservative and nonconservative terms @inline function calc_interface_flux!(surface_flux_values, - mesh::Union{P4estMesh{2}, T8codeMesh{2}}, + ::Type{<:Union{P4estMesh{2}, T8codeMesh{2}}}, have_nonconservative_terms::True, equations, - surface_integral, dg::DG, cache, - interface_index, normal_direction, - primary_node_index, primary_direction_index, + surface_integral, st::Type{<:DG}, + u_interface, interface_index, + normal_direction, primary_node_index, + primary_direction_index, primary_element_index, - secondary_node_index, secondary_direction_index, + secondary_node_index, + secondary_direction_index, secondary_element_index) - @unpack u = cache.interfaces surface_flux, nonconservative_flux = surface_integral.surface_flux - u_ll, u_rr = get_surface_node_vars(u, equations, dg, primary_node_index, + u_ll, u_rr = get_surface_node_vars(u_interface, equations, st, primary_node_index, interface_index) flux_ = surface_flux(u_ll, u_rr, normal_direction, equations) @@ -276,12 +338,8 @@ end # Note the factor 0.5 necessary for the nonconservative fluxes based on # the interpretation of global SBP operators coupled discontinuously via # central fluxes/SATs - surface_flux_values[v, primary_node_index, primary_direction_index, primary_element_index] = 
(flux_[v] + - 0.5f0 * - noncons_primary[v]) - surface_flux_values[v, secondary_node_index, secondary_direction_index, secondary_element_index] = -(flux_[v] + - 0.5f0 * - noncons_secondary[v]) + surface_flux_values[v, primary_node_index, primary_direction_index, primary_element_index] = Float64(flux_[v] + 0.5f0 * noncons_primary[v]) + surface_flux_values[v, secondary_node_index, secondary_direction_index, secondary_element_index] = Float64(-(flux_[v] + 0.5f0 * noncons_secondary[v])) end return nothing @@ -682,47 +740,86 @@ end return nothing end + function calc_surface_integral!(backend::Nothing, du, u, mesh::Union{P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}}, equations, surface_integral::SurfaceIntegralWeakForm, dg::DGSEM, cache) - @unpack boundary_interpolation = dg.basis @unpack surface_flux_values = cache.elements - # Note that all fluxes have been computed with outward-pointing normal vectors. - # Access the factors only once before beginning the loop to increase performance. - # We also use explicit assignments instead of `+=` to let `@muladd` turn these - # into FMAs (see comment at the top of the file). - factor_1 = boundary_interpolation[1, 1] - factor_2 = boundary_interpolation[nnodes(dg), 2] @threaded for element in eachelement(dg, cache) - for l in eachnode(dg) - for v in eachvariable(equations) - # surface at -x - du[v, 1, l, element] = (du[v, 1, l, element] + - surface_flux_values[v, l, 1, element] * - factor_1) - - # surface at +x - du[v, nnodes(dg), l, element] = (du[v, nnodes(dg), l, element] + - surface_flux_values[v, l, 2, element] * - factor_2) - - # surface at -y - du[v, l, 1, element] = (du[v, l, 1, element] + - surface_flux_values[v, l, 3, element] * - factor_1) - - # surface at +y - du[v, l, nnodes(dg), element] = (du[v, l, nnodes(dg), element] + - surface_flux_values[v, l, 4, element] * - factor_2) - end - end + calc_surface_integral_per_element(du, typeof(mesh), equations, + surface_integral, dg, + surface_flux_values, element) end +end +function calc_surface_integral!(backend::Backend, du, u, + mesh::Union{P4estMesh{2}, P4estMeshView{2}, + T8codeMesh{2}}, + equations, + surface_integral::SurfaceIntegralWeakForm, + dg::DGSEM, cache) + nelements(dg,cache) == 0 && return nothing + @unpack surface_flux_values = cache.elements + + kernel! = calc_surface_integral_KAkernel!(backend) + kernel!(du, typeof(mesh), equations, surface_integral, dg, + surface_flux_values, ndrange=nelements(dg,cache)) + return nothing +end + +@kernel function calc_surface_integral_KAkernel!(du, + mT::Type{<:Union{P4estMesh{2}, + P4estMeshView{2}, + T8codeMesh{2}}}, + equations, + surface_integral::SurfaceIntegralWeakForm, + dg::DGSEM, + surface_flux_values) + element = @index(Global) + calc_surface_integral_per_element!(du, mT, equations, surface_integral, + dg, surface_flux_values, element) +end + +function calc_surface_integral_per_element!(du, ::Type{<:Union{P4estMesh{2}, + P4estMeshView{2}, + T8codeMesh{2}}}, + equations, + surface_integral::SurfaceIntegralWeakForm, + dg::DGSEM, surface_flux_values, + element) + # Note that all fluxes have been computed with outward-pointing normal vectors. + # Access the factors only once before beginning the loop (outside this function) + # to increase performance. We also use explicit assignments instead of `+=` + # to let `@muladd` turn these into FMAs (see comment at the top of the file). 
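For reference, the coupling these two assignments implement is, schematically,

    F^{prim} = f^{*} + \tfrac{1}{2} \varphi^{prim}, \qquad
    F^{sec}  = -\bigl( f^{*} + \tfrac{1}{2} \varphi^{sec} \bigr),

where f^{*} = surface_flux(u_ll, u_rr, normal_direction, equations) is the conservative interface flux and \varphi^{prim}, \varphi^{sec} are the one-sided nonconservative products noncons_primary and noncons_secondary computed just above. Each side receives the full conservative flux plus half of its own nonconservative contribution (the central-flux/SAT coupling of global SBP operators that the comment refers to), and the minus sign on the secondary side reflects its oppositely oriented outward normal.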
@@ -682,47 +740,86 @@ end
     return nothing
 end
+
 function calc_surface_integral!(backend::Nothing, du, u,
                                 mesh::Union{P4estMesh{2}, P4estMeshView{2},
                                             T8codeMesh{2}},
                                 equations,
                                 surface_integral::SurfaceIntegralWeakForm,
                                 dg::DGSEM, cache)
-    @unpack boundary_interpolation = dg.basis
     @unpack surface_flux_values = cache.elements

-    # Note that all fluxes have been computed with outward-pointing normal vectors.
-    # Access the factors only once before beginning the loop to increase performance.
-    # We also use explicit assignments instead of `+=` to let `@muladd` turn these
-    # into FMAs (see comment at the top of the file).
-    factor_1 = boundary_interpolation[1, 1]
-    factor_2 = boundary_interpolation[nnodes(dg), 2]
     @threaded for element in eachelement(dg, cache)
-        for l in eachnode(dg)
-            for v in eachvariable(equations)
-                # surface at -x
-                du[v, 1, l, element] = (du[v, 1, l, element] +
-                                        surface_flux_values[v, l, 1, element] *
-                                        factor_1)
-
-                # surface at +x
-                du[v, nnodes(dg), l, element] = (du[v, nnodes(dg), l, element] +
-                                                 surface_flux_values[v, l, 2, element] *
-                                                 factor_2)
-
-                # surface at -y
-                du[v, l, 1, element] = (du[v, l, 1, element] +
-                                        surface_flux_values[v, l, 3, element] *
-                                        factor_1)
-
-                # surface at +y
-                du[v, l, nnodes(dg), element] = (du[v, l, nnodes(dg), element] +
-                                                 surface_flux_values[v, l, 4, element] *
-                                                 factor_2)
-            end
-        end
+        calc_surface_integral_per_element(du, typeof(mesh), equations,
+                                          surface_integral, dg,
+                                          surface_flux_values, element)
     end
+end

+function calc_surface_integral!(backend::Backend, du, u,
+                                mesh::Union{P4estMesh{2}, P4estMeshView{2},
+                                            T8codeMesh{2}},
+                                equations,
+                                surface_integral::SurfaceIntegralWeakForm,
+                                dg::DGSEM, cache)
+    nelements(dg,cache) == 0 && return nothing
+    @unpack surface_flux_values = cache.elements
+
+    kernel! = calc_surface_integral_KAkernel!(backend)
+    kernel!(du, typeof(mesh), equations, surface_integral, dg,
+            surface_flux_values, ndrange=nelements(dg,cache))
+    return nothing
+end
+
+@kernel function calc_surface_integral_KAkernel!(du,
+                                                 mT::Type{<:Union{P4estMesh{2},
+                                                                  P4estMeshView{2},
+                                                                  T8codeMesh{2}}},
+                                                 equations,
+                                                 surface_integral::SurfaceIntegralWeakForm,
+                                                 dg::DGSEM,
+                                                 surface_flux_values)
+    element = @index(Global)
+    calc_surface_integral_per_element!(du, mT, equations, surface_integral,
+                                       dg, surface_flux_values, element)
+end
+
+function calc_surface_integral_per_element!(du, ::Type{<:Union{P4estMesh{2},
+                                                               P4estMeshView{2},
+                                                               T8codeMesh{2}}},
+                                            equations,
+                                            surface_integral::SurfaceIntegralWeakForm,
+                                            dg::DGSEM, surface_flux_values,
+                                            element)
+    # Note that all fluxes have been computed with outward-pointing normal vectors.
+    # Access the factors only once before beginning the loop (outside this function)
+    # to increase performance. We also use explicit assignments instead of `+=`
+    # to let `@muladd` turn these into FMAs (see comment at the top of the file).
+    factor_1 = dg.basis.boundary_interpolation[1, 1]
+    factor_2 = dg.basis.boundary_interpolation[nnodes(dg), 2]
+    for l in eachnode(dg)
+        for v in eachvariable(equations)
+            # surface at -x
+            du[v, 1, l, element] = (du[v, 1, l, element] +
+                                    surface_flux_values[v, l, 1, element] *
+                                    factor_1)
+
+            # surface at +x
+            du[v, nnodes(dg), l, element] = (du[v, nnodes(dg), l, element] +
+                                             surface_flux_values[v, l, 2, element] *
+                                             factor_2)
+
+            # surface at -y
+            du[v, l, 1, element] = (du[v, l, 1, element] +
+                                    surface_flux_values[v, l, 3, element] *
+                                    factor_1)
+
+            # surface at +y
+            du[v, l, nnodes(dg), element] = (du[v, l, nnodes(dg), element] +
+                                             surface_flux_values[v, l, 4, element] *
+                                             factor_2)
+        end
+    end
     return nothing
 end
 end # @muladd

diff --git a/src/solvers/dgsem_structured/dg_2d.jl b/src/solvers/dgsem_structured/dg_2d.jl
index bfeaab65c7d..6967a05a9d1 100644
--- a/src/solvers/dgsem_structured/dg_2d.jl
+++ b/src/solvers/dgsem_structured/dg_2d.jl
@@ -640,17 +640,48 @@ function apply_jacobian!(backend::Nothing, du,
                                      T8codeMesh{2}},
                          equations, dg::DG, cache)
     @unpack inverse_jacobian = cache.elements
+    @threaded for element in eachelement(dg,cache)
+        apply_jacobian_per_element!(du, typeof(mesh), equations, dg, inverse_jacobian,
+                                    element)
+    end
+end

-    @threaded for element in eachelement(dg, cache)
-        for j in eachnode(dg), i in eachnode(dg)
-            factor = -inverse_jacobian[i, j, element]
+function apply_jacobian!(backend::Backend, du,
+                         mesh::Union{StructuredMesh{2}, StructuredMeshView{2},
+                                     UnstructuredMesh2D, P4estMesh{2}, P4estMeshView{2},
+                                     T8codeMesh{2}},
+                         equations, dg::DG, cache)
+    nelements(dg,cache) == 0 && return nothing
+    @unpack inverse_jacobian = cache.elements
+    kernel! = apply_jacobian_KAkernel!(backend)
+    kernel!(du, typeof(mesh), equations, dg, inverse_jacobian,
+            ndrange=nelements(dg,cache))
+end

-            for v in eachvariable(equations)
-                du[v, i, j, element] *= factor
-            end
+@kernel function apply_jacobian_KAkernel!(du, mT::Type{<:Union{StructuredMesh{2},
+                                                               StructuredMeshView{2},
+                                                               UnstructuredMesh2D,
+                                                               P4estMesh{2},
+                                                               P4estMeshView{2},
+                                                               T8codeMesh{2}}},
+                                          equations, dg::DG, inverse_jacobian)
+    element = @index(Global)
+    apply_jacobian_per_element!(du, mT, equations, dg, inverse_jacobian, element)
+end
+
+function apply_jacobian_per_element!(du,
+                                     ::Type{<:Union{StructuredMesh{2},
+                                                    StructuredMeshView{2},
+                                                    UnstructuredMesh2D, P4estMesh{2},
+                                                    P4estMeshView{2}, T8codeMesh{2}}},
+                                     equations, dg::DG, inverse_jacobian, element)
+    for j in eachnode(dg), i in eachnode(dg)
+        factor = -inverse_jacobian[i, j, element]
+
+        for v in eachvariable(equations)
+            du[v, i, j, element] *= factor
         end
     end
-
     return nothing
 end
 end # @muladd
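The hunks above repeat one pattern for every solver kernel: a `backend::Nothing` method keeps the multithreaded CPU loop, a `backend::Backend` method launches a KernelAbstractions.jl kernel over the same element-local body, and `typeof(mesh)`/`typeof(dg)` are passed instead of the objects themselves so that only GPU-compatible data ends up in the kernel arguments. A minimal, self-contained sketch of this split (the names `scale!`, `scale_KAkernel!`, and `scale_per_element!` are illustrative only, not part of this PR):

    using KernelAbstractions
    using KernelAbstractions: Backend, @kernel, @index

    # CPU fallback: plain loop over elements (Trixi uses @threaded here)
    function scale!(backend::Nothing, du, inverse_jacobian)
        for element in axes(du, 2)
            scale_per_element!(du, inverse_jacobian, element)
        end
        return nothing
    end

    # GPU path: one work-item per element, launched through KernelAbstractions
    function scale!(backend::Backend, du, inverse_jacobian)
        # guard against an empty ndrange, mirroring the
        # `nelements(dg, cache) == 0 && return nothing` checks above
        size(du, 2) == 0 && return nothing
        kernel! = scale_KAkernel!(backend)
        kernel!(du, inverse_jacobian, ndrange = size(du, 2))
        return nothing
    end

    @kernel function scale_KAkernel!(du, inverse_jacobian)
        element = @index(Global)
        scale_per_element!(du, inverse_jacobian, element)
    end

    # Shared element-local body, callable from both code paths
    function scale_per_element!(du, inverse_jacobian, element)
        factor = -inverse_jacobian[element]
        for v in axes(du, 1)
            du[v, element] *= factor
        end
        return nothing
    end

Both methods funnel into the same `*_per_element!` body, which is why the argument-order bug fixed in PATCH 81/81 below could be caught by CPU-only tests.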
From b59239b4c90ca1ce9739acdf007a45fcb691d279 Mon Sep 17 00:00:00 2001
From: Benedict Geihe
Date: Thu, 27 Nov 2025 10:05:45 +0100
Subject: [PATCH 79/81] enable 2D CUDA tests

---
 test/test_cuda_2d.jl | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/test/test_cuda_2d.jl b/test/test_cuda_2d.jl
index 1e20b22c34a..c13c0a4af2b 100644
--- a/test/test_cuda_2d.jl
+++ b/test/test_cuda_2d.jl
@@ -42,12 +42,11 @@ end
     using CUDA
     @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"),
                         # Expected errors are exactly the same as with TreeMesh!
-                        l2=nothing, # TODO: GPU. [Float32(8.311947673061856e-6)],
-                        linf=nothing, # TODO: GPU. [Float32(6.627000273229378e-5)],
+                        l2=[Float32(8.311947673061856e-6)],
+                        linf=[Float32(6.627000273229378e-5)],
                         RealT=Float32,
                         real_type=Float32,
-                        storage_type=CuArray,
-                        sol=nothing,) # TODO: GPU. Remove this once we can run the simulation on the GPU
+                        storage_type=CuArray)
     # # Ensure that we do not have excessive memory allocations
     # # (e.g., from type instabilities)
     # @test_allocations(Trixi.rhs!, semi, sol, 1000)
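With the expected errors restored, this elixir now runs end-to-end on the GPU. For an interactive run outside the test suite, the equivalent invocation would look roughly as follows; note that treating `real_type` and `storage_type` as keywords that override top-level variables of the same name in the elixir is an assumption based on how `@test_trixi_include` forwards them above, and the elixir path may differ:

    using Trixi, CUDA

    # trixi_include overrides top-level assignments in the elixir
    # with keyword arguments of the same name
    trixi_include("elixir_advection_basic_gpu.jl",
                  real_type = Float32,     # element type of the simulation data
                  storage_type = CuArray)  # host array type swapped in for Array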
From c0dd4b5bc0e4b3a93b6d20236c650caf51b41869 Mon Sep 17 00:00:00 2001
From: Benedict Geihe
Date: Thu, 27 Nov 2025 11:31:14 +0100
Subject: [PATCH 80/81] fmt

---
 src/callbacks_step/stepsize_dg2d.jl   | 13 +++++----
 src/solvers/dgsem_p4est/dg_2d.jl      | 39 ++++++++++++++-------------
 src/solvers/dgsem_p4est/dg_3d.jl      | 18 ++++++++-----
 src/solvers/dgsem_structured/dg_1d.jl |  2 +-
 src/solvers/dgsem_structured/dg_2d.jl | 19 ++++++-------
 5 files changed, 48 insertions(+), 43 deletions(-)

diff --git a/src/callbacks_step/stepsize_dg2d.jl b/src/callbacks_step/stepsize_dg2d.jl
index 8c5560f3b9d..d0612cc1d60 100644
--- a/src/callbacks_step/stepsize_dg2d.jl
+++ b/src/callbacks_step/stepsize_dg2d.jl
@@ -88,7 +88,7 @@ function max_dt(backend::Nothing, u, t,
     # to avoid a division by zero if the speed vanishes everywhere,
     # e.g. for steady-state linear advection
     max_scaled_speed = nextfloat(zero(t))
-    @unpack contravariant_vectors, inverse_jacobian = cache
+    @unpack contravariant_vectors, inverse_jacobian = cache.elements
     @batch reduction=(max, max_scaled_speed) for element in eachelement(dg, cache)
         max_lambda = max_scaled_speed_per_element(u, typeof(mesh), equations, dg,
                                                   contravariant_vectors,
@@ -105,7 +105,7 @@ function max_dt(backend::Backend, u, t,
                             T8codeMesh{2}, StructuredMeshView{2}},
                constant_speed::False, equations, dg::DG, cache)
     @unpack contravariant_vectors, inverse_jacobian = cache.elements
-    num_elements = nelements(dg,cache)
+    num_elements = nelements(dg, cache)
     max_scaled_speeds = allocate(backend, eltype(t), num_elements)

     kernel! = max_scaled_speed_KAkernel!(backend)
@@ -172,8 +172,8 @@ function max_dt(backend::Nothing, u, t,
     @unpack contravariant_vectors, inverse_jacobian = cache.elements
     @batch reduction=(max, max_scaled_speed) for element in eachelement(dg, cache)
         max_lambda = max_scaled_speed_per_element(u, typeof(mesh), constant_speed,
-                                                 equations, dg, contravariant_vectors,
-                                                 inverse_jacobian, element)
+                                                  equations, dg, contravariant_vectors,
+                                                  inverse_jacobian, element)
         # Use `Base.max` to prevent silent failures, as `max` from `@fastmath` doesn't propagate
         # `NaN`s properly. See https://github.com/trixi-framework/Trixi.jl/pull/2445#discussion_r2336812323
         max_scaled_speed = Base.max(max_scaled_speed, max_lambda)
@@ -187,7 +187,7 @@ function max_dt(backend::Backend, u, t,
                             P4estMeshView{2}, T8codeMesh{2}, StructuredMeshView{2}},
                constant_speed::True, equations, dg::DG, cache)
     @unpack contravariant_vectors, inverse_jacobian = cache.elements
-    num_elements = nelements(dg,cache)
+    num_elements = nelements(dg, cache)
     max_scaled_speeds = allocate(backend, eltype(t), num_elements)

     kernel! = max_scaled_speed_KAkernel!(backend)
@@ -208,7 +208,6 @@ function max_scaled_speed_per_element(u,
                                       constant_speed::True,
                                       equations, dg::DG,
                                       contravariant_vectors, inverse_jacobian,
                                       element)
-
     max_lambda1_loc = max_lambda2_loc = nextfloat(zero(eltype(u)))
     max_lambda1, max_lambda2 = max_abs_speeds(equations)
     for j in eachnode(dg), i in eachnode(dg)
@@ -225,7 +224,7 @@ function max_scaled_speed_per_element(u,
         max_lambda1_loc = max(max_lambda1_loc, inv_jacobian * lambda1_transformed)
         max_lambda2_loc = max(max_lambda2_loc, inv_jacobian * lambda2_transformed)
     end
-
+
     return max_lambda1_loc + max_lambda2_loc
 end

diff --git a/src/solvers/dgsem_p4est/dg_2d.jl b/src/solvers/dgsem_p4est/dg_2d.jl
index a85d0de2392..e2e58ec2cd4 100644
--- a/src/solvers/dgsem_p4est/dg_2d.jl
+++ b/src/solvers/dgsem_p4est/dg_2d.jl
@@ -72,9 +72,9 @@ function prolong2interfaces!(backend::Nothing, cache, u,
     index_range = eachnode(dg)

     @threaded for interface in eachinterface(dg, cache)
-        prolong2interfaces_interface!(interfaces.u, u, interface, typeof(mesh),
-                                      equations, neighbor_ids, node_indices,
-                                      index_range)
+        prolong2interfaces_per_interface!(interfaces.u, u, interface, typeof(mesh),
+                                          equations, neighbor_ids, node_indices,
+                                          index_range)
     end
     return nothing
 end
@@ -159,7 +159,6 @@ function calc_interface_flux!(backend::Nothing, surface_flux_values,
                                           T8codeMesh{2}},
                               have_nonconservative_terms,
                               equations, surface_integral, dg::DG, cache)
-
     @unpack neighbor_ids, node_indices = cache.interfaces
     @unpack contravariant_vectors = cache.elements
     index_range = eachnode(dg)
@@ -181,7 +180,6 @@ function calc_interface_flux!(backend::Backend, surface_flux_values,
                                           T8codeMesh{2}},
                               have_nonconservative_terms,
                               equations, surface_integral, dg::DG, cache)
-
     ninterfaces(cache.interfaces) == 0 && return nothing
     @unpack neighbor_ids, node_indices = cache.interfaces
     @unpack contravariant_vectors = cache.elements
@@ -191,7 +189,7 @@ function calc_interface_flux!(backend::Backend, surface_flux_values,
     kernel!(surface_flux_values, typeof(mesh), have_nonconservative_terms,
             equations, surface_integral, typeof(dg), cache.interfaces.u,
             neighbor_ids, node_indices, contravariant_vectors, index_range,
-            ndrange=ninterfaces(cache.interfaces))
+            ndrange = ninterfaces(cache.interfaces))

     return nothing
 end
@@ -275,7 +273,6 @@ function calc_interface_flux_per_interface!(surface_flux_values,
         # Increment the surface node index along the secondary element
         node_secondary += node_secondary_step
     end
-
     return nothing
 end
@@ -363,8 +360,12 @@ end
         # Note the factor 0.5 necessary for the nonconservative fluxes based on
         # the interpretation of global SBP operators coupled discontinuously via
         # central fluxes/SATs
-        surface_flux_values[v, primary_node_index, primary_direction_index, primary_element_index] = Float64(flux_[v] + 0.5f0 * noncons_primary[v])
-        surface_flux_values[v, secondary_node_index, secondary_direction_index, secondary_element_index] = Float64(-(flux_[v] + 0.5f0 * noncons_secondary[v]))
+        surface_flux_values[v, primary_node_index, primary_direction_index, primary_element_index] = Float64(flux_[v] +
+                                                                                                             0.5f0 *
+                                                                                                             noncons_primary[v])
+        surface_flux_values[v, secondary_node_index, secondary_direction_index, secondary_element_index] = Float64(-(flux_[v] +
+                                                                                                                     0.5f0 *
+                                                                                                                     noncons_secondary[v]))
     end

     return nothing
@@ -847,7 +848,6 @@ end
     return nothing
 end
-
 function calc_surface_integral!(backend::Nothing, du, u,
                                 mesh::Union{P4estMesh{2}, P4estMeshView{2},
                                             T8codeMesh{2}},
@@ -869,12 +869,12 @@ function calc_surface_integral!(backend::Backend, du, u,
                                 equations,
                                 surface_integral::SurfaceIntegralWeakForm,
                                 dg::DGSEM, cache)
-    nelements(dg,cache) == 0 && return nothing
+    nelements(dg, cache) == 0 && return nothing
     @unpack surface_flux_values = cache.elements

     kernel! = calc_surface_integral_KAkernel!(backend)
     kernel!(du, typeof(mesh), equations, surface_integral, dg,
-            surface_flux_values, ndrange=nelements(dg,cache))
+            surface_flux_values, ndrange = nelements(dg, cache))
     return nothing
 end
@@ -891,9 +891,10 @@ end
                                        dg, surface_flux_values, element)
 end

-function calc_surface_integral_per_element!(du, ::Type{<:Union{P4estMesh{2},
-                                                               P4estMeshView{2},
-                                                               T8codeMesh{2}}},
+function calc_surface_integral_per_element!(du,
+                                            ::Type{<:Union{P4estMesh{2},
+                                                           P4estMeshView{2},
+                                                           T8codeMesh{2}}},
                                             equations,
                                             surface_integral::SurfaceIntegralWeakForm,
                                             dg::DGSEM, surface_flux_values,
                                             element)
@@ -913,8 +914,8 @@ function calc_surface_integral_per_element!(du, ::Type{<:Union{P4estMesh{2},
             # surface at +x
             du[v, nnodes(dg), l, element] = (du[v, nnodes(dg), l, element] +
-                                            surface_flux_values[v, l, 2, element] *
-                                            factor_2)
+                                             surface_flux_values[v, l, 2, element] *
+                                             factor_2)

             # surface at -y
             du[v, l, 1, element] = (du[v, l, 1, element] +
                                     surface_flux_values[v, l, 3, element] *
                                     factor_1)

             # surface at +y
             du[v, l, nnodes(dg), element] = (du[v, l, nnodes(dg), element] +
-                                            surface_flux_values[v, l, 4, element] *
-                                            factor_2)
+                                             surface_flux_values[v, l, 4, element] *
+                                             factor_2)
         end
     end
     return nothing

diff --git a/src/solvers/dgsem_p4est/dg_3d.jl b/src/solvers/dgsem_p4est/dg_3d.jl
index 65cffed4a38..c92a69777ef 100644
--- a/src/solvers/dgsem_p4est/dg_3d.jl
+++ b/src/solvers/dgsem_p4est/dg_3d.jl
@@ -370,8 +370,10 @@ end
                                              secondary_direction_index,
                                              secondary_element_index)
     calc_interface_flux!(surface_flux_values, meshT, have_nonconservative_terms,
-                         combine_conservative_and_nonconservative_fluxes(surface_integral.surface_flux, equations),
-                         equations, surface_integral, solverT, u_interface, interface_index,
+                         combine_conservative_and_nonconservative_fluxes(surface_integral.surface_flux,
+                                                                         equations),
+                         equations, surface_integral, solverT, u_interface,
+                         interface_index,
                          normal_direction, primary_i_node_index, primary_j_node_index,
                          primary_direction_index, primary_element_index,
                          secondary_i_node_index, secondary_j_node_index,
@@ -384,7 +386,8 @@ end
                                       have_nonconservative_terms::True,
                                       combine_conservative_and_nonconservative_fluxes::False,
                                       equations,
-                                      surface_integral, solverT::Type{<:DG}, u_interface,
+                                      surface_integral, solverT::Type{<:DG},
+                                      u_interface,
                                       interface_index, normal_direction,
                                       primary_i_node_index, primary_j_node_index,
                                       primary_direction_index, primary_element_index,
@@ -424,7 +427,8 @@ end
                                       have_nonconservative_terms::True,
                                       combine_conservative_and_nonconservative_fluxes::True,
                                       equations,
-                                      surface_integral, solverT::Type{<:DG}, u_interface,
+                                      surface_integral, solverT::Type{<:DG},
+                                      u_interface,
                                       interface_index, normal_direction,
                                       primary_i_node_index, primary_j_node_index,
                                       primary_direction_index, primary_element_index,
@@ -582,7 +586,7 @@ end
                             direction_index, element_index, boundary_index)
     calc_boundary_flux!(surface_flux_values, t, boundary_condition, mesh,
-                        nonconservative_terms,
+                        have_nonconservative_terms,
                         combine_conservative_and_nonconservative_fluxes(surface_integral.surface_flux,
                                                                         equations),
                         equations,
@@ -594,7 +598,7 @@ end
 @inline function calc_boundary_flux!(surface_flux_values, t, boundary_condition,
                                      mesh::Union{P4estMesh{3}, T8codeMesh{3}},
-                                     nonconservative_terms::True,
+                                     have_nonconservative_terms::True,
                                      combine_conservative_and_nonconservative_fluxes::False,
                                      equations,
                                      surface_integral, dg::DG, cache,
                                      i_index, j_index,
@@ -637,7 +641,7 @@ end
 @inline function calc_boundary_flux!(surface_flux_values, t, boundary_condition,
                                      mesh::Union{P4estMesh{3}, T8codeMesh{3}},
-                                     nonconservative_terms::True,
+                                     have_nonconservative_terms::True,
                                      combine_conservative_and_nonconservative_fluxes::True,
                                      equations,
                                      surface_integral, dg::DG, cache,
                                      i_index, j_index,

diff --git a/src/solvers/dgsem_structured/dg_1d.jl b/src/solvers/dgsem_structured/dg_1d.jl
index cb98c45aed3..433d34e199f 100644
--- a/src/solvers/dgsem_structured/dg_1d.jl
+++ b/src/solvers/dgsem_structured/dg_1d.jl
@@ -69,7 +69,7 @@ function calc_boundary_flux!(cache, u, t, boundary_conditions::NamedTuple,
     return nothing
 end

-function apply_jacobian!(du, mesh::StructuredMesh{1},
+function apply_jacobian!(backend::Nothing, du, mesh::StructuredMesh{1},
                          equations, dg::DG, cache)
     @unpack inverse_jacobian = cache.elements

diff --git a/src/solvers/dgsem_structured/dg_2d.jl b/src/solvers/dgsem_structured/dg_2d.jl
index 89507a1b144..dc2dc3a119b 100644
--- a/src/solvers/dgsem_structured/dg_2d.jl
+++ b/src/solvers/dgsem_structured/dg_2d.jl
@@ -731,7 +731,7 @@ function apply_jacobian!(backend::Nothing, du,
                                      T8codeMesh{2}},
                          equations, dg::DG, cache)
     @unpack inverse_jacobian = cache.elements
-    @threaded for element in eachelement(dg,cache)
+    @threaded for element in eachelement(dg, cache)
         apply_jacobian_per_element!(du, typeof(mesh), equations, dg, inverse_jacobian,
                                     element)
     end
@@ -742,19 +742,20 @@ function apply_jacobian!(backend::Backend, du,
                                      UnstructuredMesh2D, P4estMesh{2}, P4estMeshView{2},
                                      T8codeMesh{2}},
                          equations, dg::DG, cache)
-    nelements(dg,cache) == 0 && return nothing
+    nelements(dg, cache) == 0 && return nothing
     @unpack inverse_jacobian = cache.elements
     kernel! = apply_jacobian_KAkernel!(backend)
     kernel!(du, typeof(mesh), equations, dg, inverse_jacobian,
-            ndrange=nelements(dg,cache))
+            ndrange = nelements(dg, cache))
 end

-@kernel function apply_jacobian_KAkernel!(du, mT::Type{<:Union{StructuredMesh{2},
-                                                               StructuredMeshView{2},
-                                                               UnstructuredMesh2D,
-                                                               P4estMesh{2},
-                                                               P4estMeshView{2},
-                                                               T8codeMesh{2}}},
+@kernel function apply_jacobian_KAkernel!(du,
+                                          mT::Type{<:Union{StructuredMesh{2},
+                                                           StructuredMeshView{2},
+                                                           UnstructuredMesh2D,
+                                                           P4estMesh{2},
+                                                           P4estMeshView{2},
+                                                           T8codeMesh{2}}},
                                           equations, dg::DG, inverse_jacobian)
     element = @index(Global)
     apply_jacobian_per_element!(du, mT, equations, dg, inverse_jacobian, element)
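Beyond pure formatting, the `max_dt` hunks above show how reductions are split for the GPU: the CPU methods reduce with `@batch reduction=(max, ...)`, while the `backend::Backend` methods fill a temporary `max_scaled_speeds` buffer with one local maximum per element and reduce that buffer afterwards. A rough sketch of this two-stage reduction under simplified assumptions (here `speeds[i, element]` stands in for the transformed eigenvalue computation, and the function names are illustrative):

    using KernelAbstractions
    using KernelAbstractions: Backend, @kernel, @index

    # Stage 1: each work-item reduces over the nodes of one element
    @kernel function local_max_KAkernel!(local_maxima, speeds)
        element = @index(Global)
        m = zero(eltype(speeds))
        for i in axes(speeds, 1)
            m = max(m, speeds[i, element])
        end
        local_maxima[element] = m
    end

    # Stage 2: allocate the per-element buffer on the device, launch the
    # kernel, and finish with a single array-wide reduction
    function global_max(backend::Backend, speeds)
        num_elements = size(speeds, 2)
        local_maxima = KernelAbstractions.allocate(backend, eltype(speeds),
                                                   num_elements)
        kernel! = local_max_KAkernel!(backend)
        kernel!(local_maxima, speeds, ndrange = num_elements)
        return maximum(local_maxima)
    end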
From f90f5a8866c69ab3d5669135c98628e10bf2ea4c Mon Sep 17 00:00:00 2001
From: Vivienne Ehlert <201612348+vivimie@users.noreply.github.com>
Date: Wed, 3 Dec 2025 15:28:41 +0100
Subject: [PATCH 81/81] fixes bugs in the CPU implementation

---
 src/solvers/dgsem_p4est/dg_2d.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/solvers/dgsem_p4est/dg_2d.jl b/src/solvers/dgsem_p4est/dg_2d.jl
index e2e58ec2cd4..11b19e19ffd 100644
--- a/src/solvers/dgsem_p4est/dg_2d.jl
+++ b/src/solvers/dgsem_p4est/dg_2d.jl
@@ -167,7 +167,7 @@ function calc_interface_flux!(backend::Nothing, surface_flux_values,
         calc_interface_flux_per_interface!(surface_flux_values, typeof(mesh),
                                            have_nonconservative_terms, equations,
                                            surface_integral, typeof(dg),
-                                           interface, cache.interfaces.u,
+                                           cache.interfaces.u, interface,
                                            neighbor_ids, node_indices,
                                            contravariant_vectors, index_range)
     end
@@ -857,9 +857,9 @@ function calc_surface_integral!(backend::Nothing, du, u,
     @unpack surface_flux_values = cache.elements

     @threaded for element in eachelement(dg, cache)
-        calc_surface_integral_per_element(du, typeof(mesh), equations,
-                                          surface_integral, dg,
-                                          surface_flux_values, element)
+        calc_surface_integral_per_element!(du, typeof(mesh), equations,
+                                           surface_integral, dg,
+                                           surface_flux_values, element)
     end
 end