From 60e1e5b99aa0aa919b7615662527847a564bf226 Mon Sep 17 00:00:00 2001 From: Ali Cheraghi Date: Thu, 21 Nov 2024 17:41:35 +0330 Subject: [PATCH 001/346] build: Add ClangCL profiles Also fix some compilation errors catched by clang Signed-off-by: Ali Cheraghi --- 3rdparty/CMakeLists.txt | 2 +- 3rdparty/dxc/CMakeLists.txt | 2 +- CMakeLists.txt | 8 +-- CMakePresets.json | 24 +++++-- cmake/adjust/flags.cmake | 64 +++++++++++++++---- cmake/adjust/template/vendor/CXX_Clang.cmake | 45 +++++++++++++ cmake/adjust/template/vendor/CXX_MSVC.cmake | 46 +++++++++++++ cmake/adjust/template/vendor/C_Clang.cmake | 46 +++++++++++++ .../msvc.cmake => vendor/C_MSVC.cmake} | 31 --------- cmake/common.cmake | 6 +- include/nbl/asset/IFramebuffer.h | 2 +- include/nbl/asset/IRenderpass.h | 2 +- include/nbl/macros.h | 2 +- include/nbl/video/CVulkanDeviceMemoryBacked.h | 4 +- include/nbl/video/ISwapchain.h | 6 +- include/nbl/video/TimelineEventHandlers.h | 2 +- src/nbl/CMakeLists.txt | 2 +- src/nbl/builtin/utils.cmake | 4 +- 18 files changed, 231 insertions(+), 67 deletions(-) create mode 100644 cmake/adjust/template/vendor/CXX_Clang.cmake create mode 100644 cmake/adjust/template/vendor/CXX_MSVC.cmake create mode 100644 cmake/adjust/template/vendor/C_Clang.cmake rename cmake/adjust/template/{windows/msvc.cmake => vendor/C_MSVC.cmake} (58%) diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt index 7b9b6da784..d8ac2a0d25 100755 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -493,7 +493,7 @@ if(ENABLE_HLSL) endif() foreach(trgt IN LISTS NBL_3RDPARTY_TARGETS) - if(NBL_DYNAMIC_MSVC_RUNTIME) + if(NBL_COMPILER_DYNAMIC_RUNTIME) set_property(TARGET ${trgt} PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>DLL") else() set_property(TARGET ${trgt} PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") diff --git a/3rdparty/dxc/CMakeLists.txt b/3rdparty/dxc/CMakeLists.txt index 8b34c76f88..b6e3e21e16 100644 --- a/3rdparty/dxc/CMakeLists.txt +++ 
b/3rdparty/dxc/CMakeLists.txt @@ -62,7 +62,7 @@ if(WIN32) endif() endif() -if(NBL_DYNAMIC_MSVC_RUNTIME) +if(NBL_COMPILER_DYNAMIC_RUNTIME) list(APPEND NBL_DXC_CMAKE_OPTIONS "-DCMAKE_MSVC_RUNTIME_LIBRARY:STATIC=MultiThreaded$<$:Debug>DLL") else() list(APPEND NBL_DXC_CMAKE_OPTIONS "-DCMAKE_MSVC_RUNTIME_LIBRARY:STATIC=MultiThreaded$<$:Debug>") diff --git a/CMakeLists.txt b/CMakeLists.txt index 339a89d27d..a8c9013eaa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,10 +20,10 @@ if(MSVC) endif() option(NBL_STATIC_BUILD "" OFF) # ON for static builds, OFF for shared -option(NBL_DYNAMIC_MSVC_RUNTIME "" ON) +option(NBL_COMPILER_DYNAMIC_RUNTIME "" ON) option(NBL_SANITIZE_ADDRESS OFF) -if(MSVC) +if(MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL MSVC) if(NBL_SANITIZE_ADDRESS) set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<$:ProgramDatabase>") else() @@ -35,10 +35,10 @@ if(NBL_STATIC_BUILD) message(STATUS "Static Nabla build enabled!") else() if(MSVC) - if(NBL_DYNAMIC_MSVC_RUNTIME) + if(NBL_COMPILER_DYNAMIC_RUNTIME) message(STATUS "Shared Nabla build enabled!") else() - message(FATAL_ERROR "Turn NBL_DYNAMIC_MSVC_RUNTIME on! For dynamic Nabla builds dynamic MSVC runtime is mandatory!") + message(FATAL_ERROR "Turn NBL_COMPILER_DYNAMIC_RUNTIME on! For dynamic Nabla builds dynamic MSVC runtime is mandatory!") endif() else() message(FATAL_ERROR "Nabla can't be built with shared libraries! 
Please make sure you are targetting Windows OS and MSVC compiler!") diff --git a/CMakePresets.json b/CMakePresets.json index 8d0b62367a..da28fc1aff 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -46,7 +46,7 @@ "hidden": true, "inherits": "ci-configure-static-base", "cacheVariables": { - "NBL_DYNAMIC_MSVC_RUNTIME": "OFF" + "NBL_COMPILER_DYNAMIC_RUNTIME": "OFF" }, "condition": { "type": "allOf", @@ -69,7 +69,7 @@ "hidden": true, "inherits": "ci-configure-dynamic-base", "cacheVariables": { - "NBL_DYNAMIC_MSVC_RUNTIME": "ON" + "NBL_COMPILER_DYNAMIC_RUNTIME": "ON" }, "condition": { "type": "allOf", @@ -156,7 +156,7 @@ "hidden": true, "inherits": "user-configure-static-base", "cacheVariables": { - "NBL_DYNAMIC_MSVC_RUNTIME": "OFF" + "NBL_COMPILER_DYNAMIC_RUNTIME": "OFF" }, "condition": { "type": "equals", @@ -169,7 +169,7 @@ "hidden": true, "inherits": "user-configure-dynamic-base", "cacheVariables": { - "NBL_DYNAMIC_MSVC_RUNTIME": "ON" + "NBL_COMPILER_DYNAMIC_RUNTIME": "ON" }, "condition": { "type": "equals", @@ -193,6 +193,22 @@ "generator": "Visual Studio 17 2022", "toolset": "v143" }, + { + "name": "user-configure-static-clangcl", + "inherits": "user-configure-static-windows-base", + "displayName": "[USER]: Static library target, Visual Studio 17 2022 generator, ClangCL toolset", + "description": "Configure as static library with Visual Studio 17 2022 generator and ClangCL toolset", + "generator": "Visual Studio 17 2022", + "toolset": "ClangCL" + }, + { + "name": "user-configure-dynamic-clangcl", + "inherits": "user-configure-dynamic-windows-base", + "displayName": "[USER]: Dynamic library target, Visual Studio 17 2022 generator, ClangCL toolset", + "description": "Configure as dynamic library with Visual Studio 17 2022 generator and ClangCL toolset", + "generator": "Visual Studio 17 2022", + "toolset": "ClangCL" + }, { "name": "user-configure-static-ninja-multi", "inherits": "user-configure-static-windows-base", diff --git a/cmake/adjust/flags.cmake 
b/cmake/adjust/flags.cmake index 59764cb02d..430d507c93 100644 --- a/cmake/adjust/flags.cmake +++ b/cmake/adjust/flags.cmake @@ -40,17 +40,57 @@ option(NBL_REQUEST_SSE_4_2 "Request compilation with SSE 4.2 instruction set ena option(NBL_REQUEST_SSE_AXV2 "Request compilation with SSE Intel Advanced Vector Extensions 2 for Nabla projects" ON) # profiles -if(MSVC) - include("${CMAKE_CURRENT_LIST_DIR}/template/windows/msvc.cmake") -elseif(ANDROID) - include("${CMAKE_CURRENT_LIST_DIR}/template/unix/android.cmake") -elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - include("${CMAKE_CURRENT_LIST_DIR}/template/unix/gnu.cmake") -elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - include("${CMAKE_CURRENT_LIST_DIR}/template/unix/clang.cmake") -else() - message(WARNING "UNTESTED COMPILER DETECTED, EXPECT WRONG OPTIMIZATION FLAGS! SUBMIT ISSUE ON GITHUB https://github.com/Devsh-Graphics-Programming/Nabla/issues") -endif() +foreach(NBL_COMPILER_LANGUAGE IN ITEMS C CXX) + # all list of all known by CMake vendors: + # https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html + set(NBL_COMPILER_VENDOR "${CMAKE_${NBL_COMPILER_LANGUAGE}_COMPILER_ID}") + set(NBL_PROFILE_NAME "${NBL_COMPILER_LANGUAGE}_${NBL_COMPILER_VENDOR}") # eg. "cxx_MSVC.cmake" + set(NBL_PROFILE_PATH "${CMAKE_CURRENT_LIST_DIR}/template/vendor/${NBL_PROFILE_NAME}.cmake") + + include("${NBL_PROFILE_PATH}" RESULT_VARIABLE _NBL_FOUND_) + + if(NOT _NBL_FOUND_) + message(WARNING "UNSUPPORTED \"${NBL_COMPILER_LANGUAGE}\" COMPILER LANGUAGE FOR \"${NBL_COMPILER_VENDOR}\" DETECTED, CMAKE CONFIGURATION OR BUILD MAY FAIL AND COMPILE OPTIONS FLAGS WILL NOT BE SET! 
SUBMIT ISSUE ON GITHUB https://github.com/Devsh-Graphics-Programming/Nabla/issues") + continue() + endif() + + # a profile MUST define + # - "NBL_${NBL_COMPILER_LANGUAGE}_${CONFIGURATION}_COMPILE_OPTIONS" (configuration dependent) + # - "NBL_${NBL_COMPILER_LANGUAGE}_COMPILE_OPTIONS" (global) + + # a profile MUST NOT define + # - NBL_COMPILE_OPTIONS + + set(NBL_COMPILE_OPTIONS_VAR_NAME NBL_${NBL_COMPILER_LANGUAGE}_COMPILE_OPTIONS) + set(NBL_COMPILE_OPTIONS_VAR_VALUE ${${NBL_COMPILE_OPTIONS_VAR_NAME}}) + + if(NOT DEFINED ${NBL_COMPILE_OPTIONS_VAR_NAME}) + message(FATAL_ERROR "\"${NBL_PROFILE_PATH}\" did not define \"${NBL_COMPILE_OPTIONS_VAR_NAME}\"!") + endif() + + # update map with configuration dependent compile options + foreach(CONFIGURATION IN ITEMS RELEASE RELWITHDEBINFO DEBUG) + set(NBL_CONFIGURATION_COMPILE_OPTIONS_VAR_NAME NBL_${NBL_COMPILER_LANGUAGE}_${CONFIGURATION}_COMPILE_OPTIONS) + set(NBL_CONFIGURATION_COMPILE_OPTIONS_VAR_VALUE ${${NBL_CONFIGURATION_COMPILE_OPTIONS_VAR_NAME}}) + + if(NOT DEFINED ${NBL_CONFIGURATION_COMPILE_OPTIONS_VAR_NAME}) + message(FATAL_ERROR "\"${NBL_PROFILE_PATH}\" did not define \"${NBL_CONFIGURATION_COMPILE_OPTIONS_VAR_NAME}\"!") + endif() + + list(APPEND NBL_${CONFIGURATION}_COMPILE_OPTIONS + # note that "${NBL_CONFIGURATION_COMPILE_OPTIONS_VAR_VALUE}" MUST NOT contain ANY + # $<$> generator expression in order to support our configuration mapping features + $<$:${NBL_CONFIGURATION_COMPILE_OPTIONS_VAR_VALUE}> + ) + + set(NBL_${CONFIGURATION}_COMPILE_OPTIONS ${NBL_${CONFIGURATION}_COMPILE_OPTIONS}) + endforeach() + + # update map with global compile options + list(APPEND NBL_COMPILE_OPTIONS $<$:${NBL_${NBL_COMPILER_LANGUAGE}_COMPILE_OPTIONS}>) + + set(NBL_COMPILE_OPTIONS ${NBL_COMPILE_OPTIONS}) +endforeach() function(NBL_EXT_P_APPEND_COMPILE_OPTIONS NBL_LIST_NAME MAP_RELEASE MAP_RELWITHDEBINFO MAP_DEBUG) macro(NBL_MAP_CONFIGURATION NBL_CONFIG_FROM NBL_CONFIG_TO) @@ -173,7 +213,7 @@ function(nbl_adjust_flags) set(MAPPED_CONFIG 
$>) - if(MSVC) + if(MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL MSVC) if(NBL_SANITIZE_ADDRESS) set(NBL_TARGET_MSVC_DEBUG_INFORMATION_FORMAT "$<$,$>:ProgramDatabase>") else() diff --git a/cmake/adjust/template/vendor/CXX_Clang.cmake b/cmake/adjust/template/vendor/CXX_Clang.cmake new file mode 100644 index 0000000000..4ab7d4ae83 --- /dev/null +++ b/cmake/adjust/template/vendor/CXX_Clang.cmake @@ -0,0 +1,45 @@ +include_guard(GLOBAL) + +# Debug +set(NBL_CXX_DEBUG_COMPILE_OPTIONS + -ggdb3 -Wall -fno-omit-frame-pointer -fstack-protector-strong +) + +# Release +set(NBL_CXX_RELEASE_COMPILE_OPTIONS + -fexpensive-optimizations +) + +# RelWithDebInfo +set(NBL_CXX_RELWITHDEBINFO_COMPILE_OPTIONS "") + +# Global +list(APPEND NBL_CXX_COMPILE_OPTIONS + -Wextra + -fno-strict-aliasing + -msse4.2 + -mfpmath=sse + -Wextra + -Wno-sequence-point + -Wno-unused-parameter + -Wno-unused-but-set-parameter + -Wno-error=ignored-attributes + -Wno-error=unused-function + -Wno-error=unused-variable + -Wno-error=unused-parameter + -Wno-error=ignored-attributes + -Wno-error=non-pod-varargs + -fno-exceptions +) + +if(NBL_SANITIZE_ADDRESS) + list(APPEND NBL_CXX_COMPILE_OPTIONS -fsanitize=address) +endif() + +if(NBL_SANITIZE_THREAD) + list(APPEND NBL_CXX_COMPILE_OPTIONS -fsanitize=thread) +endif() + +# our pervious flags-set function called this, does not affect flags nor configs so I will keep it here temporary +# TODO: move it out from the profile +link_libraries(-fuse-ld=gold) \ No newline at end of file diff --git a/cmake/adjust/template/vendor/CXX_MSVC.cmake b/cmake/adjust/template/vendor/CXX_MSVC.cmake new file mode 100644 index 0000000000..8b07390ed6 --- /dev/null +++ b/cmake/adjust/template/vendor/CXX_MSVC.cmake @@ -0,0 +1,46 @@ +include_guard(GLOBAL) + +# https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 + +# The default instruction set is SSE2 if no /arch option is specified. 
+if(NBL_REQUEST_SSE_4_2) + NBL_REQUEST_COMPILE_OPTION_SUPPORT("/arch:SSE4.2") +endif() + +# Enables Intel Advanced Vector Extensions 2. +if(NBL_REQUEST_SSE_AXV2) + NBL_REQUEST_COMPILE_OPTION_SUPPORT("/arch:AVX2") +endif() + +# Debug +set(NBL_CXX_DEBUG_COMPILE_OPTIONS + /Zc:__cplusplus /Ob0 /Od /MP${_NBL_JOBS_AMOUNT_} /fp:fast /Zc:wchar_t /INCREMENTAL +) + +if(NBL_SANITIZE_ADDRESS) + list(APPEND NBL_CXX_DEBUG_COMPILE_OPTIONS /RTC1) +endif() + +set(NBL_DEBUG_CXX_COMPILE_OPTIONS + $<$:${NBL_CXX_DEBUG_COMPILE_OPTIONS}> +) + +# Release +set(NBL_CXX_RELEASE_COMPILE_OPTIONS + /Zc:__cplusplus /O2 /Ob2 /DNDEBUG /GL /MP${_NBL_JOBS_AMOUNT_} /Gy- /Zc:wchar_t /sdl- /GF /GS- /fp:fast +) + +# RelWithDebInfo +set(NBL_CXX_RELWITHDEBINFO_COMPILE_OPTIONS + /Zc:__cplusplus /O2 /Ob1 /DNDEBUG /GL /Zc:wchar_t /MP${_NBL_JOBS_AMOUNT_} /Gy /sdl- /Oy- /fp:fast +) + +if(NBL_SANITIZE_ADDRESS) + list(APPEND NBL_CXX_COMPILE_OPTIONS /fsanitize=address) +endif() + +# this should also be not part of profile, pasting from old flags-set function temporary +# TODO: use profile + +#reason for INCREMENTAL:NO: https://docs.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=vs-2019 /LTCG is not valid for use with /INCREMENTAL. 
+set(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO} /INCREMENTAL:NO /LTCG:incremental") diff --git a/cmake/adjust/template/vendor/C_Clang.cmake b/cmake/adjust/template/vendor/C_Clang.cmake new file mode 100644 index 0000000000..e4eb0d6ad9 --- /dev/null +++ b/cmake/adjust/template/vendor/C_Clang.cmake @@ -0,0 +1,46 @@ +include_guard(GLOBAL) + +# Debug +set(NBL_C_DEBUG_COMPILE_OPTIONS + -ggdb3 -Wall -fno-omit-frame-pointer -fstack-protector-strong +) + +# Release +set(NBL_C_RELEASE_COMPILE_OPTIONS + -fexpensive-optimizations +) + +# RelWithDebInfo +set(NBL_C_RELWITHDEBINFO_COMPILE_OPTIONS "") + +# Global +list(APPEND NBL_C_COMPILE_OPTIONS + -Wextra + -fno-strict-aliasing + -msse4.2 + -mfpmath=sse + -maes + -Wextra + -Wno-sequence-point + -Wno-unused-parameter + -Wno-unused-but-set-parameter + -Wno-error=ignored-attributes + -Wno-error=unused-function + -Wno-error=unused-variable + -Wno-error=unused-parameter + -Wno-error=ignored-attributes + -Wno-error=non-pod-varargs + -fno-exceptions +) + +if(NBL_SANITIZE_ADDRESS) + list(APPEND NBL_C_COMPILE_OPTIONS -fsanitize=address) +endif() + +if(NBL_SANITIZE_THREAD) + list(APPEND NBL_C_COMPILE_OPTIONS -fsanitize=thread) +endif() + +# our pervious flags-set function called this, does not affect flags nor configs so I will keep it here temporary +# TODO: move it out from the profile +link_libraries(-fuse-ld=gold) \ No newline at end of file diff --git a/cmake/adjust/template/windows/msvc.cmake b/cmake/adjust/template/vendor/C_MSVC.cmake similarity index 58% rename from cmake/adjust/template/windows/msvc.cmake rename to cmake/adjust/template/vendor/C_MSVC.cmake index e0eaa82e80..76bace680f 100644 --- a/cmake/adjust/template/windows/msvc.cmake +++ b/cmake/adjust/template/vendor/C_MSVC.cmake @@ -21,51 +21,20 @@ if(NBL_SANITIZE_ADDRESS) list(APPEND NBL_C_DEBUG_COMPILE_OPTIONS /RTC1) endif() -set(NBL_CXX_DEBUG_COMPILE_OPTIONS - /Zc:__cplusplus ${NBL_C_DEBUG_COMPILE_OPTIONS} -) - 
-set(NBL_DEBUG_COMPILE_OPTIONS - $<$:${NBL_CXX_DEBUG_COMPILE_OPTIONS}> - $<$:${NBL_C_DEBUG_COMPILE_OPTIONS}> -) - # Release set(NBL_C_RELEASE_COMPILE_OPTIONS /O2 /Ob2 /DNDEBUG /GL /MP${_NBL_JOBS_AMOUNT_} /Gy- /Zc:wchar_t /sdl- /GF /GS- /fp:fast ) -set(NBL_CXX_RELEASE_COMPILE_OPTIONS - /Zc:__cplusplus ${NBL_C_RELEASE_COMPILE_OPTIONS} -) - -set(NBL_RELEASE_COMPILE_OPTIONS - $<$:${NBL_CXX_RELEASE_COMPILE_OPTIONS}> - $<$:${NBL_C_RELEASE_COMPILE_OPTIONS}> -) # RelWithDebInfo set(NBL_C_RELWITHDEBINFO_COMPILE_OPTIONS /O2 /Ob1 /DNDEBUG /GL /Zc:wchar_t /MP${_NBL_JOBS_AMOUNT_} /Gy /sdl- /Oy- /fp:fast ) -set(NBL_CXX_RELWITHDEBINFO_COMPILE_OPTIONS - /Zc:__cplusplus ${NBL_C_RELWITHDEBINFO_COMPILE_OPTIONS} -) - -set(NBL_RELWITHDEBINFO_COMPILE_OPTIONS - $<$:${NBL_CXX_RELWITHDEBINFO_COMPILE_OPTIONS}> - $<$:${NBL_C_RELWITHDEBINFO_COMPILE_OPTIONS}> -) if(NBL_SANITIZE_ADDRESS) list(APPEND NBL_C_COMPILE_OPTIONS /fsanitize=address) - list(APPEND NBL_CXX_COMPILE_OPTIONS ${NBL_C_COMPILE_OPTIONS}) endif() -set(NBL_COMPILE_OPTIONS - $<$:${NBL_CXX_COMPILE_OPTIONS}> - $<$:${NBL_C_COMPILE_OPTIONS}> -) - # this should also be not part of profile, pasting from old flags-set function temporary # TODO: use profile diff --git a/cmake/common.cmake b/cmake/common.cmake index 86b1856ed3..d89c1ae071 100755 --- a/cmake/common.cmake +++ b/cmake/common.cmake @@ -25,7 +25,7 @@ function(nbl_handle_dll_definitions _TARGET_ _SCOPE_) message(FATAL_ERROR "Internal error, requsted \"${_TARGET_}\" is not defined!") endif() - if(NBL_DYNAMIC_MSVC_RUNTIME) + if(NBL_COMPILER_DYNAMIC_RUNTIME) set(_NABLA_OUTPUT_DIR_ "${NBL_ROOT_PATH_BINARY}/src/nbl/$/devshgraphicsprogramming.nabla") target_compile_definitions(${_TARGET_} ${_SCOPE_} @@ -43,7 +43,7 @@ function(nbl_handle_runtime_lib_properties _TARGET_) message(FATAL_ERROR "Internal error, requsted \"${_TARGET_}\" is not defined!") endif() - if(NBL_DYNAMIC_MSVC_RUNTIME) + if(MSVC) set_target_properties(${_TARGET_} PROPERTIES MSVC_RUNTIME_LIBRARY 
"MultiThreaded$<$:Debug>DLL") else() set_target_properties(${_TARGET_} PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") @@ -75,7 +75,7 @@ macro(nbl_create_executable_project _EXTRA_SOURCES _EXTRA_OPTIONS _EXTRA_INCLUDE nbl_handle_runtime_lib_properties(${EXECUTABLE_NAME}) if(WIN32 AND MSVC) - if(NBL_DYNAMIC_MSVC_RUNTIME) + if(NBL_COMPILER_DYNAMIC_RUNTIME) target_link_options(${EXECUTABLE_NAME} PUBLIC "/DELAYLOAD:$") endif() diff --git a/include/nbl/asset/IFramebuffer.h b/include/nbl/asset/IFramebuffer.h index 9c78fe1e42..4f4abb89da 100644 --- a/include/nbl/asset/IFramebuffer.h +++ b/include/nbl/asset/IFramebuffer.h @@ -121,7 +121,7 @@ class IFramebuffer return true; // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkFramebufferCreateInfo.html#VUID-VkFramebufferCreateInfo-pAttachments-00884 - if (viewParams.components!=ImageViewType::SComponentMapping()) + if (viewParams.components!=typename ImageViewType::SComponentMapping()) return true; // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkFramebufferCreateInfo.html#VUID-VkFramebufferCreateInfo-flags-04533 diff --git a/include/nbl/asset/IRenderpass.h b/include/nbl/asset/IRenderpass.h index 7595911716..1e44d8b526 100644 --- a/include/nbl/asset/IRenderpass.h +++ b/include/nbl/asset/IRenderpass.h @@ -707,7 +707,7 @@ inline bool IRenderpass::SCreationParams::SSubpassDescription::SDepthStencilAtta template inline bool IRenderpass::SCreationParams::SSubpassDescription::SRenderAttachmentsRef::valid(const typename attachment_ref_t::description_t* descs, const uint32_t attachmentCount) const { - if (!render.valid(descs,attachmentCount) || !resolve.valid(descs,attachmentCount)) + if (!render.template valid(descs,attachmentCount) || !resolve.template valid(descs,attachmentCount)) return false; const bool renderUsed = render.used(); if (resolve.used()) diff --git a/include/nbl/macros.h b/include/nbl/macros.h index 4927f21899..fe93201a11 100644 --- a/include/nbl/macros.h +++ 
b/include/nbl/macros.h @@ -81,7 +81,7 @@ //! Workarounds for compiler specific bugs // MSVC 2019 is a special snowflake -#if defined(_MSC_VER) && _MSC_VER>=1920 +#if defined(_MSC_VER) && !defined(__clang__) && _MSC_VER>=1920 #define NBL_TYPENAME_4_STTC_MBR typename #else #define NBL_TYPENAME_4_STTC_MBR diff --git a/include/nbl/video/CVulkanDeviceMemoryBacked.h b/include/nbl/video/CVulkanDeviceMemoryBacked.h index c996000e04..e6d17ddf3e 100644 --- a/include/nbl/video/CVulkanDeviceMemoryBacked.h +++ b/include/nbl/video/CVulkanDeviceMemoryBacked.h @@ -47,8 +47,8 @@ class CVulkanDeviceMemoryBacked : public Interface }; #ifndef _NBL_VIDEO_C_VULKAN_DEVICE_MEMORY_BACKED_CPP_ -extern template CVulkanDeviceMemoryBacked; -extern template CVulkanDeviceMemoryBacked; +extern template class CVulkanDeviceMemoryBacked; +extern template class CVulkanDeviceMemoryBacked; #endif } // end namespace nbl::video diff --git a/include/nbl/video/ISwapchain.h b/include/nbl/video/ISwapchain.h index d052a819bd..99ba2e7975 100644 --- a/include/nbl/video/ISwapchain.h +++ b/include/nbl/video/ISwapchain.h @@ -21,6 +21,8 @@ class ISwapchain : public IBackendObject struct SSharedCreationParams { + SSharedCreationParams() {} + inline bool valid(const IPhysicalDevice* physDev, const ISurface* surface) const { ISurface::SCapabilities caps; @@ -465,10 +467,10 @@ class ISwapchain : public IBackendObject virtual const void* getNativeHandle() const = 0; // returns the maximum number of time acquires with infinite timeout which can be called before releasing the image index through present. - virtual uint8_t getMaxBlockingAcquiresBeforePresent() const = 0u; + virtual uint8_t getMaxBlockingAcquiresBeforePresent() const = 0; // returns the maximum number of acquires you can request without waiting for previous acquire semaphores to signal. 
- virtual uint8_t getMaxAcquiresInFlight() const = 0u; + virtual uint8_t getMaxAcquiresInFlight() const = 0; // only public because MultiTimelineEventHandlerST needs to know about it class DeferredFrameSemaphoreDrop final diff --git a/include/nbl/video/TimelineEventHandlers.h b/include/nbl/video/TimelineEventHandlers.h index 9405accf78..a3d6aa4c8b 100644 --- a/include/nbl/video/TimelineEventHandlers.h +++ b/include/nbl/video/TimelineEventHandlers.h @@ -410,7 +410,7 @@ class MultiTimelineEventHandlerST final : core::Unmovable, core::Uncopyable sum += handler->count(); else { - const auto local = handler->poll_impl(std::forward(args)...); + const auto local = handler->template poll_impl(std::forward(args)...); bailed = local.bailed; // if don't have any events left, remove the timeline if (local.eventsLeft) diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 83845b9c84..f96e031fca 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -350,7 +350,7 @@ if(NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) target_compile_definitions(Nabla PUBLIC NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) endif() -if(NBL_DYNAMIC_MSVC_RUNTIME) +if(NBL_COMPILER_DYNAMIC_RUNTIME) set_property(TARGET Nabla PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>DLL") else() set_property(TARGET Nabla PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") diff --git a/src/nbl/builtin/utils.cmake b/src/nbl/builtin/utils.cmake index 0a76d1c67e..04c15de86d 100644 --- a/src/nbl/builtin/utils.cmake +++ b/src/nbl/builtin/utils.cmake @@ -39,7 +39,7 @@ endmacro() # _NAMESPACE_ is a C++ namespace builtin resources will be wrapped into # _OUTPUT_INCLUDE_SEARCH_DIRECTORY_ is an absolute path to output directory for builtin resources header files which will be a search directory for generated headers outputed to ${_OUTPUT_HEADER_DIRECTORY_}/${_NAMESPACE_PREFIX_} where namespace prefix is the namespace turned into a path # _OUTPUT_SOURCE_DIRECTORY_ is an absolute path to output directory for 
builtin resources source files -# _STATIC_ optional last argument is a bool, if true then add_library will use STATIC, SHARED otherwise. Pay attention that MSVC runtime is controlled by NBL_DYNAMIC_MSVC_RUNTIME which is not an argument of this function +# _STATIC_ optional last argument is a bool, if true then add_library will use STATIC, SHARED otherwise. Pay attention that MSVC runtime is controlled by NBL_COMPILER_DYNAMIC_RUNTIME which is not an argument of this function # # As an example one could list a resource as following # LIST_BUILTIN_RESOURCE(SOME_RESOURCES_TO_EMBED "glsl/blit/default_compute_normalization.comp") @@ -207,7 +207,7 @@ function(ADD_CUSTOM_BUILTIN_RESOURCES _TARGET_NAME_ _BUNDLE_NAME_ _BUNDLE_SEARCH ) set_target_properties(${_TARGET_NAME_} PROPERTIES CXX_STANDARD 20) - if(NBL_DYNAMIC_MSVC_RUNTIME) + if(NBL_COMPILER_DYNAMIC_RUNTIME) set_property(TARGET ${_TARGET_NAME_} PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>DLL") else() set_property(TARGET ${_TARGET_NAME_} PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") From 114c549f13f9d8e1de7b7ea6eb53daeacc2d78a5 Mon Sep 17 00:00:00 2001 From: Ali Cheraghi Date: Thu, 21 Nov 2024 20:21:22 +0330 Subject: [PATCH 002/346] build: one liner ifs and some fixes Signed-off-by: Ali Cheraghi --- 3rdparty/CMakeLists.txt | 6 +----- 3rdparty/dxc/CMakeLists.txt | 6 +----- CMakeLists.txt | 12 +++--------- cmake/adjust/template/vendor/CXX_MSVC.cmake | 4 ---- cmake/common.cmake | 6 +----- src/nbl/builtin/utils.cmake | 8 ++------ 6 files changed, 8 insertions(+), 34 deletions(-) diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt index d8ac2a0d25..b27ea0437c 100755 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -493,11 +493,7 @@ if(ENABLE_HLSL) endif() foreach(trgt IN LISTS NBL_3RDPARTY_TARGETS) - if(NBL_COMPILER_DYNAMIC_RUNTIME) - set_property(TARGET ${trgt} PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>DLL") - else() - set_property(TARGET ${trgt} PROPERTY 
MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") - endif() + set_property(TARGET ${trgt} PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>$<$:DLL>") if(MSVC AND NBL_SANITIZE_ADDRESS) set_property(TARGET ${trgt} PROPERTY COMPILE_OPTIONS /fsanitize=address) diff --git a/3rdparty/dxc/CMakeLists.txt b/3rdparty/dxc/CMakeLists.txt index b6e3e21e16..2142a574ec 100644 --- a/3rdparty/dxc/CMakeLists.txt +++ b/3rdparty/dxc/CMakeLists.txt @@ -62,11 +62,7 @@ if(WIN32) endif() endif() -if(NBL_COMPILER_DYNAMIC_RUNTIME) - list(APPEND NBL_DXC_CMAKE_OPTIONS "-DCMAKE_MSVC_RUNTIME_LIBRARY:STATIC=MultiThreaded$<$:Debug>DLL") -else() - list(APPEND NBL_DXC_CMAKE_OPTIONS "-DCMAKE_MSVC_RUNTIME_LIBRARY:STATIC=MultiThreaded$<$:Debug>") -endif() +list(APPEND NBL_DXC_CMAKE_OPTIONS "-DCMAKE_MSVC_RUNTIME_LIBRARY:STATIC=MultiThreaded$<$:Debug>$<$:DLL>") # perform DXC compile standard requirement test set(CMAKE_CXX_STANDARD_REQUIRED ON) diff --git a/CMakeLists.txt b/CMakeLists.txt index a8c9013eaa..68e913770c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,7 +23,7 @@ option(NBL_STATIC_BUILD "" OFF) # ON for static builds, OFF for shared option(NBL_COMPILER_DYNAMIC_RUNTIME "" ON) option(NBL_SANITIZE_ADDRESS OFF) -if(MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL MSVC) +if(CMAKE_CXX_COMPILER_ID STREQUAL MSVC) if(NBL_SANITIZE_ADDRESS) set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<$:ProgramDatabase>") else() @@ -34,14 +34,8 @@ endif() if(NBL_STATIC_BUILD) message(STATUS "Static Nabla build enabled!") else() - if(MSVC) - if(NBL_COMPILER_DYNAMIC_RUNTIME) - message(STATUS "Shared Nabla build enabled!") - else() - message(FATAL_ERROR "Turn NBL_COMPILER_DYNAMIC_RUNTIME on! For dynamic Nabla builds dynamic MSVC runtime is mandatory!") - endif() - else() - message(FATAL_ERROR "Nabla can't be built with shared libraries! Please make sure you are targetting Windows OS and MSVC compiler!") + if(NOT NBL_COMPILER_DYNAMIC_RUNTIME) + message(FATAL_ERROR "Turn NBL_COMPILER_DYNAMIC_RUNTIME on! 
For dynamic Nabla builds dynamic runtime is mandatory!") endif() endif() diff --git a/cmake/adjust/template/vendor/CXX_MSVC.cmake b/cmake/adjust/template/vendor/CXX_MSVC.cmake index 8b07390ed6..1abb66c9da 100644 --- a/cmake/adjust/template/vendor/CXX_MSVC.cmake +++ b/cmake/adjust/template/vendor/CXX_MSVC.cmake @@ -21,10 +21,6 @@ if(NBL_SANITIZE_ADDRESS) list(APPEND NBL_CXX_DEBUG_COMPILE_OPTIONS /RTC1) endif() -set(NBL_DEBUG_CXX_COMPILE_OPTIONS - $<$:${NBL_CXX_DEBUG_COMPILE_OPTIONS}> -) - # Release set(NBL_CXX_RELEASE_COMPILE_OPTIONS /Zc:__cplusplus /O2 /Ob2 /DNDEBUG /GL /MP${_NBL_JOBS_AMOUNT_} /Gy- /Zc:wchar_t /sdl- /GF /GS- /fp:fast diff --git a/cmake/common.cmake b/cmake/common.cmake index d89c1ae071..c663a98443 100755 --- a/cmake/common.cmake +++ b/cmake/common.cmake @@ -43,11 +43,7 @@ function(nbl_handle_runtime_lib_properties _TARGET_) message(FATAL_ERROR "Internal error, requsted \"${_TARGET_}\" is not defined!") endif() - if(MSVC) - set_target_properties(${_TARGET_} PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>DLL") - else() - set_target_properties(${_TARGET_} PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") - endif() + set_target_properties(${_TARGET_} PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>$<$:DLL>") endfunction() # Macro creating project for an executable diff --git a/src/nbl/builtin/utils.cmake b/src/nbl/builtin/utils.cmake index 04c15de86d..e5b1741a95 100644 --- a/src/nbl/builtin/utils.cmake +++ b/src/nbl/builtin/utils.cmake @@ -206,12 +206,8 @@ function(ADD_CUSTOM_BUILTIN_RESOURCES _TARGET_NAME_ _BUNDLE_NAME_ _BUNDLE_SEARCH "${_OUTPUT_HEADER_DIRECTORY_}" ) set_target_properties(${_TARGET_NAME_} PROPERTIES CXX_STANDARD 20) - - if(NBL_COMPILER_DYNAMIC_RUNTIME) - set_property(TARGET ${_TARGET_NAME_} PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>DLL") - else() - set_property(TARGET ${_TARGET_NAME_} PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") - endif() + + set_property(TARGET ${_TARGET_NAME_} 
PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>$<$:DLL>") set(NBL_BUILTIN_RESOURCES ${NBL_BUILTIN_RESOURCES}) # turn builtin resources paths list into variable From ff5513b33e434f8ce21f06d3e71c85b59f905c99 Mon Sep 17 00:00:00 2001 From: Ali Cheraghi Date: Thu, 21 Nov 2024 21:43:55 +0330 Subject: [PATCH 003/346] update dxc submodule Signed-off-by: Ali Cheraghi --- 3rdparty/dxc/dxc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc index 5adc27f9e4..b8e1df19be 160000 --- a/3rdparty/dxc/dxc +++ b/3rdparty/dxc/dxc @@ -1 +1 @@ -Subproject commit 5adc27f9e42de7681d65a98873048af661b9b367 +Subproject commit b8e1df19bebaf18ff1d6b9b90d7d020cf86f3205 From 44acfcfbbe0034946307f39cebaef809de386a47 Mon Sep 17 00:00:00 2001 From: Ali Cheraghi Date: Fri, 22 Nov 2024 20:21:29 +0330 Subject: [PATCH 004/346] build: simplify if Signed-off-by: Ali Cheraghi --- cmake/adjust/flags.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/adjust/flags.cmake b/cmake/adjust/flags.cmake index 430d507c93..bec887ef1a 100644 --- a/cmake/adjust/flags.cmake +++ b/cmake/adjust/flags.cmake @@ -213,7 +213,7 @@ function(nbl_adjust_flags) set(MAPPED_CONFIG $>) - if(MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL MSVC) + if(CMAKE_CXX_COMPILER_ID STREQUAL MSVC) if(NBL_SANITIZE_ADDRESS) set(NBL_TARGET_MSVC_DEBUG_INFORMATION_FORMAT "$<$,$>:ProgramDatabase>") else() From 2b4a1214177aa5db5514664e47c7ba5f6f42fdaa Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Fri, 4 Apr 2025 12:53:51 +0200 Subject: [PATCH 005/346] save work --- tools/nsc/CMakeLists.txt | 69 ++++++++++++++++++++++++++++++++++------ 1 file changed, 60 insertions(+), 9 deletions(-) diff --git a/tools/nsc/CMakeLists.txt b/tools/nsc/CMakeLists.txt index 1582e9ecd6..bb45442982 100644 --- a/tools/nsc/CMakeLists.txt +++ b/tools/nsc/CMakeLists.txt @@ -120,7 +120,9 @@ set(NBL_CE_GENERATE_CONFIG_COMMAND -P "${CMAKE_CURRENT_SOURCE_DIR}/ce-generate-config.cmake" ) 
-set(NBL_DOCKER_CE_COMPOSE_BASE "${NBL_ROOT_PATH}/docker/compiler-explorer/compose.yml") +set(NBL_DOCKER_CE_DOCKER_CTX "${NBL_ROOT_PATH}/docker/compiler-explorer") +set(NBL_DOCKER_CE_DOCKERFILE_BASE "${NBL_DOCKER_CE_DOCKER_CTX}/Dockerfile") +set(NBL_DOCKER_CE_COMPOSE_BASE "${NBL_DOCKER_CE_DOCKER_CTX}/compose.yml") cmake_path(NATIVE_PATH NBL_DOCKER_CE_COMPOSE_BASE NORMALIZE NBL_DOCKER_CE_COMPOSE_BASE) set(NBL_DOCKER_CE_COMPOSE_TARGET "${GODBOLT_BINARY_DIRECTORY}/.dev-compose.yml") @@ -273,20 +275,21 @@ ON set(BASE_IMAGE dr.devsh.eu/compiler-explorer/windows) # NOTE to self: could be all done with single docker file & compose file but buildkit works bad with windows driver, yet need to wait for stuff to be implemented -set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/docker/devel") +set(DEVEL_CTX "${CMAKE_CURRENT_BINARY_DIR}/docker/devel") set(CT_REDIST_DIR "${CT_TOOLSET_REDIST_TARGET}/${REDIST_CRT_TOOLSET_VERSION}") set(CT_NONREDIST_CTR_DIR "${CT_REDIST_DIR}/${DEBUG_CRT_RELATIVE}") cmake_path(NATIVE_PATH CT_REDIST_DIR NORMALIZE CT_REDIST_DIR) cmake_path(NATIVE_PATH CT_NONREDIST_CTR_DIR NORMALIZE CT_NONREDIST_CTR_DIR) -set(DEVEL_DOCKERFILE "${OUTPUT_DIRECTORY}/Dockerfile") +set(DEVEL_DOCKERFILE "${DEVEL_CTX}/Dockerfile") -GEN_DOCKER_CONTENT("" "${OUTPUT_DIRECTORY}" +GEN_DOCKER_CONTENT("" "${DEVEL_CTX}" [=[ -COPY --from=@DOCKER_VULKAN_TAG@ /@CT_VULKAN_TARGET@ /@CT_VULKAN_TARGET@ -COPY --from=@DOCKER_CRT_TAG@ /@CT_TOOLSET_REDIST_TARGET@ /@CT_TOOLSET_REDIST_TARGET@ +COPY --link --from=@DOCKER_VULKAN_TAG@ /@CT_VULKAN_TARGET@ /@CT_VULKAN_TARGET@ +COPY --link --from=@DOCKER_CRT_TAG@ /@CT_TOOLSET_REDIST_TARGET@ /@CT_TOOLSET_REDIST_TARGET@ -RUN .\@CT_REDIST_DIR@\vc_redist.x64.exe /quiet /install +# TODO +# RUN .\@CT_REDIST_DIR@\vc_redist.x64.exe /quiet /install RUN xcopy .\@CT_NONREDIST_CTR_DIR@\*.dll %SystemRoot%\System32 /Y RUN xcopy .\@CT_TOOLSET_REDIST_TARGET@\ucrtbased.dll %SystemRoot%\System32 /Y @@ -348,8 +351,56 @@ string(CONFIGURE "${COMPOSE_CONTENT}" 
COMPOSE_CONTENT @ONLY) file(WRITE "${NBL_DOCKER_CE_COMPOSE_TARGET}" "${COMPOSE_CONTENT}") make_directory("${GODBOLT_BINARY_DIRECTORY}/.ctx") -execute_process(COMMAND "${DOCKER_EXE}" compose -f "${NBL_DOCKER_CE_COMPOSE_BASE}" build) -execute_process(COMMAND "${DOCKER_EXE}" compose -f "${NBL_DOCKER_CE_COMPOSE_TARGET}" build) +function(_PROMOTE_PROCESS_ISOLATION_ KERNEL BASES VAR) + set(${VAR} True) + set(ix 0) + list(LENGTH BASES LEN) + + while(ix LESS ${LEN}) + list(GET BASES ${ix} BASE) + + execute_process(COMMAND "${DOCKER_EXE}" inspect --format={{.OsVersion}} ${BASE} RESULT_VARIABLE EXIT_LEVEL OUTPUT_VARIABLE TARGET_KERNEL OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(${EXIT_LEVEL} EQUAL 0) + if(${KERNEL} VERSION_LESS ${TARGET_KERNEL}) + set(${VAR} False PARENT_SCOPE) + message(STATUS "While inspecting ${BASE} - host Kernel ${KERNEL} too low to use container process isolation (target ${TARGET_KERNEL}), falling back to HyperV. Please update your host OS.") + return() + endif() + math(EXPR ix "${ix} + 1") + else() + message(STATUS "Docker image ${BASE} not found locally, pulling...") + execute_process(COMMAND "${DOCKER_EXE}" pull ${BASE}) + endif() + endwhile() + + set(${VAR} ${${VAR}} PARENT_SCOPE) +endfunction() + +execute_process(COMMAND cmd /C ver OUTPUT_VARIABLE PIPE OUTPUT_STRIP_TRAILING_WHITESPACE) +string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+" HOST_KERNEL "${PIPE}") + +set(BASES + mcr.microsoft.com/windows/nanoserver:ltsc2022 + mcr.microsoft.com/powershell:lts-nanoserver-ltsc2022 +) + +_PROMOTE_PROCESS_ISOLATION_("${HOST_KERNEL}" "${BASES}" PROMOTE_TO_PROCESS) + +function(_BUILD_IMAGE_ DOCKERFILE CTX TAG) + set(CMD "${DOCKER_EXE}" build) + if(PROMOTE_TO_PROCESS) + list(APPEND CMD --isolation "process") + endif() + list(APPEND CMD -t ${TAG} -f "${DOCKERFILE}" .) 
+ + execute_process(COMMAND ${CMD} WORKING_DIRECTORY "${CTX}") +endfunction() + +_BUILD_IMAGE_("${NBL_DOCKER_CE_DOCKERFILE_BASE}" "${NBL_DOCKER_CE_DOCKER_CTX}" godbolt/base/windows) +_BUILD_IMAGE_("${DEVEL_DOCKERFILE}" "${DEVEL_CTX}" godbolt/devel/windows) + +message(FATAL_ERROR "STOP TEST, PROMOTE_TO_PROCESS = ${PROMOTE_TO_PROCESS}") string(APPEND BAT_PRODUCTION_INSTALL [=[ From 616f7d7b210b95c079da20659f8762b7f1a743ae Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Tue, 8 Apr 2025 11:01:50 +0200 Subject: [PATCH 006/346] fixing CLang build, save work --- cmake/adjust/template/vendor/CXX_Clang.cmake | 7 +++++++ cmake/adjust/template/vendor/C_Clang.cmake | 10 ++++++++-- include/nbl/asset/IDescriptorSetLayout.h | 4 ++-- include/nbl/asset/IRenderpass.h | 1 + include/nbl/asset/utils/CSPIRVIntrospector.h | 8 ++++---- .../builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl | 2 +- 6 files changed, 23 insertions(+), 9 deletions(-) diff --git a/cmake/adjust/template/vendor/CXX_Clang.cmake b/cmake/adjust/template/vendor/CXX_Clang.cmake index 4ab7d4ae83..258fef3d8a 100644 --- a/cmake/adjust/template/vendor/CXX_Clang.cmake +++ b/cmake/adjust/template/vendor/CXX_Clang.cmake @@ -15,14 +15,21 @@ set(NBL_CXX_RELWITHDEBINFO_COMPILE_OPTIONS "") # Global list(APPEND NBL_CXX_COMPILE_OPTIONS + -Wno-everything # TMP -Wextra -fno-strict-aliasing -msse4.2 + -maes -mfpmath=sse -Wextra -Wno-sequence-point -Wno-unused-parameter -Wno-unused-but-set-parameter + -Wno-c++98-compat + -Wno-c++98-compat-pedantic + -Wno-padded + -Wno-unsafe-buffer-usage + -Wno-switch-enum -Wno-error=ignored-attributes -Wno-error=unused-function -Wno-error=unused-variable diff --git a/cmake/adjust/template/vendor/C_Clang.cmake b/cmake/adjust/template/vendor/C_Clang.cmake index e4eb0d6ad9..3dc21dec15 100644 --- a/cmake/adjust/template/vendor/C_Clang.cmake +++ b/cmake/adjust/template/vendor/C_Clang.cmake @@ -15,15 +15,21 @@ set(NBL_C_RELWITHDEBINFO_COMPILE_OPTIONS "") # Global list(APPEND NBL_C_COMPILE_OPTIONS + 
-Wno-everything # TMP -Wextra -fno-strict-aliasing -msse4.2 - -mfpmath=sse - -maes + -maes + -mfpmath=sse -Wextra -Wno-sequence-point -Wno-unused-parameter -Wno-unused-but-set-parameter + -Wno-c++98-compat + -Wno-c++98-compat-pedantic + -Wno-padded + -Wno-unsafe-buffer-usage + -Wno-switch-enum -Wno-error=ignored-attributes -Wno-error=unused-function -Wno-error=unused-variable diff --git a/include/nbl/asset/IDescriptorSetLayout.h b/include/nbl/asset/IDescriptorSetLayout.h index 44e8be71ea..ec3c182fdc 100644 --- a/include/nbl/asset/IDescriptorSetLayout.h +++ b/include/nbl/asset/IDescriptorSetLayout.h @@ -330,7 +330,7 @@ class IDescriptorSetLayout : public IDescriptorSetLayoutBase bindings[i].binding = i; bindings[i].type = type; bindings[i].createFlags = SBinding::E_CREATE_FLAGS::ECF_NONE; - bindings[i].stageFlags = stageAccessFlags ? stageAccessFlags[i]:asset::IShader::ESS_ALL_OR_LIBRARY; + bindings[i].stageFlags = stageAccessFlags ? stageAccessFlags[i]:asset::IShader::E_SHADER_STAGE::ESS_ALL_OR_LIBRARY; bindings[i].count = counts ? counts[i]:1u; bindings[i].samplers = nullptr; } @@ -354,7 +354,7 @@ class IDescriptorSetLayout : public IDescriptorSetLayoutBase for (uint32_t b = 0u; b < bindingCnt; ++b) { auto bindingNumber = m_descriptorRedirects[t].m_bindingNumbers[b]; - CBindingRedirect::template binding_number_t otherBindingNumber(CBindingRedirect::Invalid); + CBindingRedirect::binding_number_t otherBindingNumber(CBindingRedirect::Invalid); // TODO: std::find instead? 
for (uint32_t ob = 0u; ob < otherBindingCnt; ++ob) { diff --git a/include/nbl/asset/IRenderpass.h b/include/nbl/asset/IRenderpass.h index 657b0fcaff..b9554fc2a6 100644 --- a/include/nbl/asset/IRenderpass.h +++ b/include/nbl/asset/IRenderpass.h @@ -81,6 +81,7 @@ class NBL_API2 IRenderpass { bool valid() const; }; + // The arrays pointed to by this array must be terminated by `DepthStencilAttachmentsEnd` value, which implicitly satisfies a few VUIDs constexpr static inline SDepthStencilAttachmentDescription DepthStencilAttachmentsEnd = {}; const SDepthStencilAttachmentDescription* depthStencilAttachments = &DepthStencilAttachmentsEnd; diff --git a/include/nbl/asset/utils/CSPIRVIntrospector.h b/include/nbl/asset/utils/CSPIRVIntrospector.h index f756a58a42..7a2310a62e 100644 --- a/include/nbl/asset/utils/CSPIRVIntrospector.h +++ b/include/nbl/asset/utils/CSPIRVIntrospector.h @@ -326,8 +326,8 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable template inline std::enable_if_t isLastMemberRuntimeSized() const { - if (type->memberCount) - return type->memberTypes()[type->memberCount-1].count.front().isRuntimeSized(); + if (this->type->memberCount) + return this->type->memberTypes()[this->type->memberCount-1].count.front().isRuntimeSized(); return false; } template @@ -335,9 +335,9 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable { if (isLastMemberRuntimeSized()) { - const auto& lastMember = type->memberTypes()[type->memberCount-1]; + const auto& lastMember = this->type->memberTypes()[this->type->memberCount-1]; assert(!lastMember.count.front().isSpecConstantID); - return sizeWithoutLastMember+lastMemberElementCount*type->memberStrides()[type->memberCount-1]; + return sizeWithoutLastMember+lastMemberElementCount* this->type->memberStrides()[this->type->memberCount-1]; } return sizeWithoutLastMember; } diff --git a/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl b/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl index 
0309b78e0d..94da595ef2 100644 --- a/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl +++ b/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl @@ -598,7 +598,7 @@ struct nClamp_helper using return_t = T; static inline return_t __call(const T x, const T _min, const T _max) { - return nMin_helper::_call(nMax_helper::_call(x, _min), _max); + return nMin_helper::_call(nMin_helper::_call(x, _min), _max); } }; From aad8bb1445ffece46681f11c73bf5372421ea5d0 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Tue, 8 Apr 2025 14:23:40 +0200 Subject: [PATCH 007/346] make Nabla Clang build work --- cmake/adjust/template/vendor/CXX_Clang.cmake | 1 - cmake/adjust/template/vendor/C_Clang.cmake | 3 +-- include/nbl/asset/IRenderpass.h | 16 +++++++++++----- include/nbl/asset/filters/CBlitImageFilter.h | 2 +- .../nbl/asset/filters/kernels/WeightFunctions.h | 4 ++-- include/nbl/asset/utils/CSPIRVIntrospector.h | 8 +++++++- include/nbl/video/ILogicalDevice.h | 2 +- include/nbl/video/utilities/CSubpassKiln.h | 2 +- src/nbl/video/CVulkanCommandBuffer.cpp | 2 +- src/nbl/video/CVulkanDeviceMemoryBacked.cpp | 4 ++-- src/nbl/video/IGPUAccelerationStructure.cpp | 8 ++++---- src/nbl/video/utilities/CAssetConverter.cpp | 14 +++++++------- 12 files changed, 38 insertions(+), 28 deletions(-) diff --git a/cmake/adjust/template/vendor/CXX_Clang.cmake b/cmake/adjust/template/vendor/CXX_Clang.cmake index 258fef3d8a..62c12075d1 100644 --- a/cmake/adjust/template/vendor/CXX_Clang.cmake +++ b/cmake/adjust/template/vendor/CXX_Clang.cmake @@ -15,7 +15,6 @@ set(NBL_CXX_RELWITHDEBINFO_COMPILE_OPTIONS "") # Global list(APPEND NBL_CXX_COMPILE_OPTIONS - -Wno-everything # TMP -Wextra -fno-strict-aliasing -msse4.2 diff --git a/cmake/adjust/template/vendor/C_Clang.cmake b/cmake/adjust/template/vendor/C_Clang.cmake index 3dc21dec15..1c00f78e84 100644 --- a/cmake/adjust/template/vendor/C_Clang.cmake +++ b/cmake/adjust/template/vendor/C_Clang.cmake @@ -14,8 +14,7 @@ 
set(NBL_C_RELEASE_COMPILE_OPTIONS set(NBL_C_RELWITHDEBINFO_COMPILE_OPTIONS "") # Global -list(APPEND NBL_C_COMPILE_OPTIONS - -Wno-everything # TMP +list(APPEND NBL_C_COMPILE_OPTIONS -Wextra -fno-strict-aliasing -msse4.2 diff --git a/include/nbl/asset/IRenderpass.h b/include/nbl/asset/IRenderpass.h index b9554fc2a6..ce41e35573 100644 --- a/include/nbl/asset/IRenderpass.h +++ b/include/nbl/asset/IRenderpass.h @@ -83,10 +83,10 @@ class NBL_API2 IRenderpass }; // The arrays pointed to by this array must be terminated by `DepthStencilAttachmentsEnd` value, which implicitly satisfies a few VUIDs - constexpr static inline SDepthStencilAttachmentDescription DepthStencilAttachmentsEnd = {}; + static const SDepthStencilAttachmentDescription DepthStencilAttachmentsEnd; // have to initialize out of line because of https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88165 const SDepthStencilAttachmentDescription* depthStencilAttachments = &DepthStencilAttachmentsEnd; // The arrays pointed to by this array must be terminated by `ColorAttachmentsEnd` value, which implicitly satisfies a few VUIDs - constexpr static inline SColorAttachmentDescription ColorAttachmentsEnd = {}; + static const SColorAttachmentDescription ColorAttachmentsEnd; // have to initialize out of line because of https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88165 const SColorAttachmentDescription* colorAttachments = &ColorAttachmentsEnd; struct SSubpassDescription final @@ -200,7 +200,7 @@ class NBL_API2 IRenderpass SColorAttachmentsRef colorAttachments[MaxColorAttachments] = {}; // The arrays pointed to by this array must be terminated by `InputAttachmentsEnd` value - constexpr static inline SInputAttachmentRef InputAttachmentsEnd = {}; + static const SInputAttachmentRef InputAttachmentsEnd; const SInputAttachmentRef* inputAttachments = &InputAttachmentsEnd; struct SPreserveAttachmentRef @@ -233,7 +233,7 @@ class NBL_API2 IRenderpass // 
https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSubpassDescription2.html#VUID-VkSubpassDescription2-pipelineBindPoint-04953 //E_PIPELINE_BIND_POINT pipelineBindPoint : 2 = EPBP_GRAPHICS; }; - constexpr static inline SSubpassDescription SubpassesEnd = {}; + static const SSubpassDescription SubpassesEnd; const SSubpassDescription* subpasses = &SubpassesEnd; struct SSubpassDependency final @@ -259,7 +259,7 @@ class NBL_API2 IRenderpass bool valid() const; }; // The arrays pointed to by this array must be terminated by `DependenciesEnd` value - constexpr static inline SSubpassDependency DependenciesEnd = {}; + static const SSubpassDependency DependenciesEnd; const SSubpassDependency* dependencies = &DependenciesEnd; @@ -380,6 +380,12 @@ class NBL_API2 IRenderpass uint32_t m_loadOpColorAttachmentEnd = ~0u; }; +constexpr inline IRenderpass::SCreationParams::SDepthStencilAttachmentDescription IRenderpass::SCreationParams::DepthStencilAttachmentsEnd = {}; +constexpr inline IRenderpass::SCreationParams::SColorAttachmentDescription IRenderpass::SCreationParams::ColorAttachmentsEnd = {}; +constexpr inline IRenderpass::SCreationParams::SSubpassDescription::SInputAttachmentRef IRenderpass::SCreationParams::SSubpassDescription::InputAttachmentsEnd = {}; +constexpr inline IRenderpass::SCreationParams::SSubpassDescription IRenderpass::SCreationParams::SubpassesEnd = {}; +constexpr inline IRenderpass::SCreationParams::SSubpassDependency IRenderpass::SCreationParams::DependenciesEnd = {}; + inline bool IRenderpass::compatible(const IRenderpass* other) const { // If you find yourself spending a lot of time here in your profile, go ahead and implement a precomputed hash and store it in the renderpass diff --git a/include/nbl/asset/filters/CBlitImageFilter.h b/include/nbl/asset/filters/CBlitImageFilter.h index 1dbc7809ba..f228fea325 100644 --- a/include/nbl/asset/filters/CBlitImageFilter.h +++ b/include/nbl/asset/filters/CBlitImageFilter.h @@ -464,7 +464,7 @@ class 
CBlitImageFilter : auto phaseCount = IBlitUtilities::getPhaseCount(inExtentLayerCount.xyz, outExtentLayerCount.xyz, inImageType); phaseCount = hlsl::max(phaseCount,hlsl::uint32_t3(1,1,1)); - const auto axisOffsets = blit_utils_t::template getScaledKernelPhasedLUTAxisOffsets(phaseCount,real_window_size); + const auto axisOffsets = blit_utils_t::getScaledKernelPhasedLUTAxisOffsets(phaseCount,real_window_size); constexpr auto MaxAxisCount = 3; lut_value_t* scaledKernelPhasedLUTPixel[MaxAxisCount]; for (auto i = 0; i < MaxAxisCount; ++i) diff --git a/include/nbl/asset/filters/kernels/WeightFunctions.h b/include/nbl/asset/filters/kernels/WeightFunctions.h index bb0b8fb9b4..af2782dfac 100644 --- a/include/nbl/asset/filters/kernels/WeightFunctions.h +++ b/include/nbl/asset/filters/kernels/WeightFunctions.h @@ -337,12 +337,12 @@ class CWeightFunction1D final : public impl::IWeightFunction1Dscale(base_t::value_t(1)/stretchFactor); + this->scale(typename base_t::value_t(1)/stretchFactor); } inline base_t::value_t weight(const float x) const { - return static_cast(this->getTotalScale()*function_t::weight(x*this->getInvStretch())); + return static_cast(this->getTotalScale()*function_t::template weight(x*this->getInvStretch())); } // Integral of `weight(x) dx` from -INF to +INF diff --git a/include/nbl/asset/utils/CSPIRVIntrospector.h b/include/nbl/asset/utils/CSPIRVIntrospector.h index 7a2310a62e..45fcb0e3a7 100644 --- a/include/nbl/asset/utils/CSPIRVIntrospector.h +++ b/include/nbl/asset/utils/CSPIRVIntrospector.h @@ -208,7 +208,13 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable // `memberStrides[i]` only relevant if `memberTypes[i]->isArray()` inline ptr_t memberStrides() const {return memberOffsets()+memberCount;} using member_matrix_info_t = MatrixInfo; - inline ptr_t memberMatrixInfos() const {return reinterpret_cast&>(memberStrides()+memberCount); } + inline ptr_t memberMatrixInfos() const + { + auto t = memberStrides() + memberCount; + + return 
reinterpret_cast&>(t); + + } constexpr static inline size_t StoragePerMember = sizeof(member_type_t)+sizeof(member_name_t)+sizeof(member_size_t)+sizeof(member_offset_t)+sizeof(member_stride_t)+sizeof(member_matrix_info_t); diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 0cc6608b16..46f7dc1ce7 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -1502,7 +1502,7 @@ inline bool ILogicalDevice::validateMemoryBarrier(const uint32_t queueFamilyInde return false; }; // CANNOT CHECK: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkImageMemoryBarrier2-oldLayout-01197 - if (mismatchedLayout.operator()(barrier.oldLayout) || mismatchedLayout.operator()(barrier.newLayout)) + if (mismatchedLayout.template operator()(barrier.oldLayout) || mismatchedLayout.template operator()(barrier.newLayout)) return false; } diff --git a/include/nbl/video/utilities/CSubpassKiln.h b/include/nbl/video/utilities/CSubpassKiln.h index 7df6cc0caa..c41ec3dd7e 100644 --- a/include/nbl/video/utilities/CSubpassKiln.h +++ b/include/nbl/video/utilities/CSubpassKiln.h @@ -198,7 +198,7 @@ class CSubpassKiln if (begin==end) return; - bake_impl(cmdbuf->getOriginDevice()->getPhysicalDevice()->getLimits().indirectDrawCount, drawIndirectBuffer, drawCountBuffer)(cmdbuf, begin, end); + bake_impl(cmdbuf->getOriginDevice()->getPhysicalDevice()->getLimits().drawIndirectCount, drawIndirectBuffer, drawCountBuffer)(cmdbuf, begin, end); } protected: diff --git a/src/nbl/video/CVulkanCommandBuffer.cpp b/src/nbl/video/CVulkanCommandBuffer.cpp index b569a5fde2..9f0b0a83e1 100644 --- a/src/nbl/video/CVulkanCommandBuffer.cpp +++ b/src/nbl/video/CVulkanCommandBuffer.cpp @@ -661,7 +661,7 @@ bool CVulkanCommandBuffer::beginRenderPass_impl(const SRenderpassBeginInfo& info .renderArea = info.renderArea, // Implicitly but could be optimizedif needed // 
https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkRenderPassBeginInfo.html#VUID-VkRenderPassBeginInfo-clearValueCount-00902 - .clearValueCount = vk_clearValues.size()/sizeof(VkClearValue), + .clearValueCount = static_cast(vk_clearValues.size()/sizeof(VkClearValue)), // Implicit // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkRenderPassBeginInfo.html#VUID-VkRenderPassBeginInfo-clearValueCount-04962 .pClearValues = vk_clearValues.data() diff --git a/src/nbl/video/CVulkanDeviceMemoryBacked.cpp b/src/nbl/video/CVulkanDeviceMemoryBacked.cpp index 2bec9e9d06..90b2993cb3 100644 --- a/src/nbl/video/CVulkanDeviceMemoryBacked.cpp +++ b/src/nbl/video/CVulkanDeviceMemoryBacked.cpp @@ -40,7 +40,7 @@ CVulkanDeviceMemoryBacked::CVulkanDeviceMemoryBacked( assert(vkHandle!=VK_NULL_HANDLE); } -template CVulkanDeviceMemoryBacked; -template CVulkanDeviceMemoryBacked; +template class CVulkanDeviceMemoryBacked; +template class CVulkanDeviceMemoryBacked; } \ No newline at end of file diff --git a/src/nbl/video/IGPUAccelerationStructure.cpp b/src/nbl/video/IGPUAccelerationStructure.cpp index eafbe08d6f..ae78754b1e 100644 --- a/src/nbl/video/IGPUAccelerationStructure.cpp +++ b/src/nbl/video/IGPUAccelerationStructure.cpp @@ -140,11 +140,11 @@ uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::valid(cons retval += geometryCount*MaxBuffersPerGeometry; return retval; } -template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::template valid(const uint32_t* const) const; -template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::template valid(const uint32_t* const) const; +template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::valid(const uint32_t* const) const; +template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::valid(const uint32_t* const) const; using BuildRangeInfo = hlsl::acceleration_structures::bottom_level::BuildRangeInfo; -template uint32_t 
IGPUBottomLevelAccelerationStructure::BuildInfo::template valid(const BuildRangeInfo* const) const; -template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::template valid(const BuildRangeInfo* const) const; +template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::valid(const BuildRangeInfo* const) const; +template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::valid(const BuildRangeInfo* const) const; bool IGPUBottomLevelAccelerationStructure::validVertexFormat(const asset::E_FORMAT format) const { diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 796f3dcaec..fdb5c61ca8 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2142,7 +2142,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { for (auto& entry : conversionRequests) for (auto i=0ull; i(entry.first,entry.second.firstCopyIx,i,device->createSampler(entry.second.canonicalAsset->getParams())); + assign.template operator()(entry.first,entry.second.firstCopyIx,i,device->createSampler(entry.second.canonicalAsset->getParams())); } if constexpr (std::is_same_v) { @@ -2461,7 +2461,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { // since we don't have dependants we don't care about our group ID // we create threadsafe pipeline caches, because we have no idea how they may be used - assign.operator()(entry.first,entry.second.firstCopyIx,i,device->createPipelineCache(asset,false)); + assign.template operator()(entry.first,entry.second.firstCopyIx,i,device->createPipelineCache(asset,false)); } } } @@ -2506,7 +2506,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { // since we don't have dependants we don't care about our group ID // we create threadsafe pipeline caches, because we have no idea how they may be used - 
assign.operator()(entry.first,entry.second.firstCopyIx,i,device->createRenderpass(asset->getCreationParameters())); + assign.template operator()(entry.first,entry.second.firstCopyIx,i,device->createRenderpass(asset->getCreationParameters())); } } } @@ -2653,7 +2653,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult gpuObj.get()->setObjectDebugName(debugName.str().c_str()); } // insert into staging cache - stagingCache.emplace(gpuObj.get(),CCache::key_t(contentHash,uniqueCopyGroupID)); + stagingCache.emplace(gpuObj.get(),typename CCache::key_t(contentHash,uniqueCopyGroupID)); // propagate back to dfsCache created.gpuObj = std::move(gpuObj); // record if a device memory allocation will be needed @@ -2668,11 +2668,11 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // this is super annoying, was hoping metaprogramming with `has_type` would actually work auto getConversionRequests = [&]()->auto&{return std::get>(retval.m_conversionRequests);}; if constexpr (std::is_same_v) - getConversionRequests.operator()().emplace_back(core::smart_refctd_ptr(instance.asset),created.gpuObj.get());; + getConversionRequests.template operator()().emplace_back(core::smart_refctd_ptr(instance.asset),created.gpuObj.get());; if constexpr (std::is_same_v) { const uint16_t recomputeMips = created.patch.recomputeMips; - getConversionRequests.operator()().emplace_back(core::smart_refctd_ptr(instance.asset),created.gpuObj.get(),recomputeMips); + getConversionRequests.template operator()().emplace_back(core::smart_refctd_ptr(instance.asset),created.gpuObj.get(),recomputeMips); } // TODO: BLAS and TLAS requests } @@ -2939,7 +2939,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // if something with this content hash is in the stagingCache, then it must match the `found->gpuObj` if (auto finalCacheIt=stagingCache.find(gpuObj.get()); finalCacheIt!=stagingCache.end()) { - const bool matches = 
finalCacheIt->second==CCache::key_t(found.contentHash,uniqueCopyGroupID); + const bool matches = finalCacheIt->second==typename CCache::key_t(found.contentHash,uniqueCopyGroupID); assert(matches); } } From 7b8cb61f0cbd56580a216e02c87a3627a28d7a5d Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Tue, 8 Apr 2025 15:05:51 +0200 Subject: [PATCH 008/346] bad typo --- include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl b/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl index 94da595ef2..0d95c032b0 100644 --- a/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl +++ b/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl @@ -598,7 +598,7 @@ struct nClamp_helper using return_t = T; static inline return_t __call(const T x, const T _min, const T _max) { - return nMin_helper::_call(nMin_helper::_call(x, _min), _max); + return nMin_helper::_call(nMax_helper::_call(x, _min), _max); } }; From 062b5baa4ae3af2284f39b9b6d983b8a55c354a7 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Fri, 11 Apr 2025 12:00:42 +0200 Subject: [PATCH 009/346] update profiles & flags check handle, take care of Clang profile; enter new compile errors after upgrading VS and toolsets (coming from DXC source files) --- cmake/adjust/flags.cmake | 59 ++++++++++----- cmake/adjust/template/vendor/CXX_Clang.cmake | 52 +------------ cmake/adjust/template/vendor/CXX_MSVC.cmake | 43 +---------- cmake/adjust/template/vendor/C_Clang.cmake | 52 +------------ cmake/adjust/template/vendor/C_MSVC.cmake | 45 +---------- cmake/adjust/template/vendor/impl/Clang.cmake | 75 +++++++++++++++++++ cmake/adjust/template/vendor/impl/MSVC.cmake | 71 ++++++++++++++++++ cmake/adjust/template/vendor/impl/reset.cmake | 8 ++ 8 files changed, 208 insertions(+), 197 deletions(-) create mode 100644 cmake/adjust/template/vendor/impl/Clang.cmake create mode 100644 
cmake/adjust/template/vendor/impl/MSVC.cmake create mode 100644 cmake/adjust/template/vendor/impl/reset.cmake diff --git a/cmake/adjust/flags.cmake b/cmake/adjust/flags.cmake index eb16a95791..ead5a086e6 100644 --- a/cmake/adjust/flags.cmake +++ b/cmake/adjust/flags.cmake @@ -12,32 +12,57 @@ define_property(TARGET PROPERTY NBL_CONFIGURATION_MAP BRIEF_DOCS "Stores configuration map for a target, it will evaluate to the configuration it's mapped to" ) -function(NBL_REQUEST_COMPILE_OPTION_SUPPORT _NBL_COMPILE_OPTION_) - set(NBL_COMPILE_OPTION "${_NBL_COMPILE_OPTION_}") +# Usage: NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG CONFIG OPTIONS ) +# LANG, CONFIG - optional, OPTIONS - required +function(NBL_REQUEST_COMPILE_OPTION_SUPPORT) + cmake_parse_arguments(IMPL "" "" "LANG;CONFIG;OPTIONS" ${ARGN}) - foreach(COMPILER IN ITEMS c cxx) + set(DEFAULT_COMPILERS c cxx) + + if(NOT IMPL_LANG) + list(APPEND IMPL_LANG ${DEFAULT_COMPILERS}) + endif() + + if(NOT IMPL_OPTIONS) + message(FATAL_ERROR "NBL_REQUEST_COMPILE_OPTION_SUPPORT's OPTIONS empty!") + endif() + + foreach(COMPILER IN ITEMS ${IMPL_LANG}) string(TOUPPER "${COMPILER}" COMPILER_UPPER) - string(REGEX REPLACE "[-=:;/.]" "_" flag_signature "${NBL_COMPILE_OPTION}") - set(flag_var "__${COMPILER_UPPER}_Flag_${flag_signature}") + if(COMPILER_UPPER STREQUAL C) + macro(VALIDATE_FLAG) + check_c_compiler_flag(${ARGV}) + endmacro() + elseif(COMPILER_UPPER STREQUAL CXX) + macro(VALIDATE_FLAG) + check_cxx_compiler_flag(${ARGV}) + endmacro() + endif() + + foreach(COMPILE_OPTION ${IMPL_OPTIONS}) + string(REGEX REPLACE "[-=:;/.]" "_" FLAG_SIGNATURE "${COMPILE_OPTION}") + set(FLAG_VAR "NBL_${COMPILER_UPPER}_COMPILER_HAS_${FLAG_SIGNATURE}_FLAG") - if(COMPILER STREQUAL "c") - check_c_compiler_flag("${NBL_COMPILE_OPTION}" ${flag_var}) - elseif(COMPILER STREQUAL "cxx") - check_cxx_compiler_flag("${NBL_COMPILE_OPTION}" ${flag_var}) - endif() + VALIDATE_FLAG("${COMPILE_OPTION}" "${FLAG_VAR}") - if(${flag_var}) - message(STATUS "Enabled 
\"${NBL_COMPILE_OPTION}\" ${COMPILER_UPPER} compile option for Nabla projects!") - set(NBL_${COMPILER_UPPER}_COMPILE_OPTIONS "${NBL_${COMPILER_UPPER}_COMPILE_OPTIONS};${NBL_COMPILE_OPTION}" PARENT_SCOPE) - else() - message(STATUS "Disabled \"${NBL_COMPILE_OPTION}\" ${COMPILER_UPPER} compile option for Nabla projects! (no support)") - endif() + if(${FLAG_VAR}) + if(IMPL_CONFIG) + foreach(CONFIG ${IMPL_CONFIG}) + # TODO: validate (${CONFIG} \in ${CMAKE_CONFIGURATION_TYPES}) + string(TOUPPER "${CONFIG}" CONFIG_UPPER) + set(NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_COMPILE_OPTIONS "${NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_COMPILE_OPTIONS};${COMPILE_OPTION}" PARENT_SCOPE) + endforeach() + else() + set(NBL_${COMPILER_UPPER}_COMPILE_OPTIONS "${NBL_${COMPILER_UPPER}_COMPILE_OPTIONS};${COMPILE_OPTION}" PARENT_SCOPE) + endif() + endif() + endforeach() endforeach() endfunction() option(NBL_REQUEST_SSE_4_2 "Request compilation with SSE 4.2 instruction set enabled for Nabla projects" ON) -option(NBL_REQUEST_SSE_AXV2 "Request compilation with SSE Intel Advanced Vector Extensions 2 for Nabla projects" ON) +option(NBL_REQUEST_SSE_AVX2 "Request compilation with SSE Intel Advanced Vector Extensions 2 for Nabla projects" ON) # profiles foreach(NBL_COMPILER_LANGUAGE IN ITEMS C CXX) diff --git a/cmake/adjust/template/vendor/CXX_Clang.cmake b/cmake/adjust/template/vendor/CXX_Clang.cmake index 62c12075d1..2cc877c028 100644 --- a/cmake/adjust/template/vendor/CXX_Clang.cmake +++ b/cmake/adjust/template/vendor/CXX_Clang.cmake @@ -1,51 +1,5 @@ include_guard(GLOBAL) -# Debug -set(NBL_CXX_DEBUG_COMPILE_OPTIONS - -ggdb3 -Wall -fno-omit-frame-pointer -fstack-protector-strong -) - -# Release -set(NBL_CXX_RELEASE_COMPILE_OPTIONS - -fexpensive-optimizations -) - -# RelWithDebInfo -set(NBL_CXX_RELWITHDEBINFO_COMPILE_OPTIONS "") - -# Global -list(APPEND NBL_CXX_COMPILE_OPTIONS - -Wextra - -fno-strict-aliasing - -msse4.2 - -maes - -mfpmath=sse - -Wextra - -Wno-sequence-point - -Wno-unused-parameter - 
-Wno-unused-but-set-parameter - -Wno-c++98-compat - -Wno-c++98-compat-pedantic - -Wno-padded - -Wno-unsafe-buffer-usage - -Wno-switch-enum - -Wno-error=ignored-attributes - -Wno-error=unused-function - -Wno-error=unused-variable - -Wno-error=unused-parameter - -Wno-error=ignored-attributes - -Wno-error=non-pod-varargs - -fno-exceptions -) - -if(NBL_SANITIZE_ADDRESS) - list(APPEND NBL_CXX_COMPILE_OPTIONS -fsanitize=address) -endif() - -if(NBL_SANITIZE_THREAD) - list(APPEND NBL_CXX_COMPILE_OPTIONS -fsanitize=thread) -endif() - -# our pervious flags-set function called this, does not affect flags nor configs so I will keep it here temporary -# TODO: move it out from the profile -link_libraries(-fuse-ld=gold) \ No newline at end of file +set(LANG CXX) +include("${CMAKE_CURRENT_LIST_DIR}/impl/Clang.cmake") +# append unique CXX options here \ No newline at end of file diff --git a/cmake/adjust/template/vendor/CXX_MSVC.cmake b/cmake/adjust/template/vendor/CXX_MSVC.cmake index 1abb66c9da..59f4e59cdd 100644 --- a/cmake/adjust/template/vendor/CXX_MSVC.cmake +++ b/cmake/adjust/template/vendor/CXX_MSVC.cmake @@ -1,42 +1,5 @@ include_guard(GLOBAL) -# https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 - -# The default instruction set is SSE2 if no /arch option is specified. -if(NBL_REQUEST_SSE_4_2) - NBL_REQUEST_COMPILE_OPTION_SUPPORT("/arch:SSE4.2") -endif() - -# Enables Intel Advanced Vector Extensions 2. 
-if(NBL_REQUEST_SSE_AXV2) - NBL_REQUEST_COMPILE_OPTION_SUPPORT("/arch:AVX2") -endif() - -# Debug -set(NBL_CXX_DEBUG_COMPILE_OPTIONS - /Zc:__cplusplus /Ob0 /Od /MP${_NBL_JOBS_AMOUNT_} /fp:fast /Zc:wchar_t /INCREMENTAL -) - -if(NBL_SANITIZE_ADDRESS) - list(APPEND NBL_CXX_DEBUG_COMPILE_OPTIONS /RTC1) -endif() - -# Release -set(NBL_CXX_RELEASE_COMPILE_OPTIONS - /Zc:__cplusplus /O2 /Ob2 /DNDEBUG /GL /MP${_NBL_JOBS_AMOUNT_} /Gy- /Zc:wchar_t /sdl- /GF /GS- /fp:fast -) - -# RelWithDebInfo -set(NBL_CXX_RELWITHDEBINFO_COMPILE_OPTIONS - /Zc:__cplusplus /O2 /Ob1 /DNDEBUG /GL /Zc:wchar_t /MP${_NBL_JOBS_AMOUNT_} /Gy /sdl- /Oy- /fp:fast -) - -if(NBL_SANITIZE_ADDRESS) - list(APPEND NBL_CXX_COMPILE_OPTIONS /fsanitize=address) -endif() - -# this should also be not part of profile, pasting from old flags-set function temporary -# TODO: use profile - -#reason for INCREMENTAL:NO: https://docs.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=vs-2019 /LTCG is not valid for use with /INCREMENTAL. 
-set(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO} /INCREMENTAL:NO /LTCG:incremental") +set(LANG CXX) +include("${CMAKE_CURRENT_LIST_DIR}/impl/MSVC.cmake") +# append unique CXX options here \ No newline at end of file diff --git a/cmake/adjust/template/vendor/C_Clang.cmake b/cmake/adjust/template/vendor/C_Clang.cmake index 1c00f78e84..046ccaa902 100644 --- a/cmake/adjust/template/vendor/C_Clang.cmake +++ b/cmake/adjust/template/vendor/C_Clang.cmake @@ -1,51 +1,5 @@ include_guard(GLOBAL) -# Debug -set(NBL_C_DEBUG_COMPILE_OPTIONS - -ggdb3 -Wall -fno-omit-frame-pointer -fstack-protector-strong -) - -# Release -set(NBL_C_RELEASE_COMPILE_OPTIONS - -fexpensive-optimizations -) - -# RelWithDebInfo -set(NBL_C_RELWITHDEBINFO_COMPILE_OPTIONS "") - -# Global -list(APPEND NBL_C_COMPILE_OPTIONS - -Wextra - -fno-strict-aliasing - -msse4.2 - -maes - -mfpmath=sse - -Wextra - -Wno-sequence-point - -Wno-unused-parameter - -Wno-unused-but-set-parameter - -Wno-c++98-compat - -Wno-c++98-compat-pedantic - -Wno-padded - -Wno-unsafe-buffer-usage - -Wno-switch-enum - -Wno-error=ignored-attributes - -Wno-error=unused-function - -Wno-error=unused-variable - -Wno-error=unused-parameter - -Wno-error=ignored-attributes - -Wno-error=non-pod-varargs - -fno-exceptions -) - -if(NBL_SANITIZE_ADDRESS) - list(APPEND NBL_C_COMPILE_OPTIONS -fsanitize=address) -endif() - -if(NBL_SANITIZE_THREAD) - list(APPEND NBL_C_COMPILE_OPTIONS -fsanitize=thread) -endif() - -# our pervious flags-set function called this, does not affect flags nor configs so I will keep it here temporary -# TODO: move it out from the profile -link_libraries(-fuse-ld=gold) \ No newline at end of file +set(LANG C) +include("${CMAKE_CURRENT_LIST_DIR}/impl/Clang.cmake") +# append unique C options here \ No newline at end of file diff --git a/cmake/adjust/template/vendor/C_MSVC.cmake b/cmake/adjust/template/vendor/C_MSVC.cmake index ddc0007bb5..f9aca4a5b7 100644 --- 
a/cmake/adjust/template/vendor/C_MSVC.cmake +++ b/cmake/adjust/template/vendor/C_MSVC.cmake @@ -1,44 +1,5 @@ include_guard(GLOBAL) -# https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 - -# The default instruction set is SSE2 if no /arch option is specified. -if(NBL_REQUEST_SSE_4_2) - NBL_REQUEST_COMPILE_OPTION_SUPPORT("/arch:SSE4.2") -endif() - -# Enables Intel Advanced Vector Extensions 2. -if(NBL_REQUEST_SSE_AXV2) - NBL_REQUEST_COMPILE_OPTION_SUPPORT("/arch:AVX2") -endif() - -NBL_REQUEST_COMPILE_OPTION_SUPPORT(/Zc:preprocessor) - -# Debug -set(NBL_C_DEBUG_COMPILE_OPTIONS - /Ob0 /Od /MP${_NBL_JOBS_AMOUNT_} /fp:fast /Zc:wchar_t /INCREMENTAL -) - -if(NBL_SANITIZE_ADDRESS) - list(APPEND NBL_C_DEBUG_COMPILE_OPTIONS /RTC1) -endif() - -# Release -set(NBL_C_RELEASE_COMPILE_OPTIONS - /O2 /Ob2 /DNDEBUG /GL /MP${_NBL_JOBS_AMOUNT_} /Gy- /Zc:wchar_t /sdl- /GF /GS- /fp:fast -) - -# RelWithDebInfo -set(NBL_C_RELWITHDEBINFO_COMPILE_OPTIONS - /O2 /Ob1 /DNDEBUG /GL /Zc:wchar_t /MP${_NBL_JOBS_AMOUNT_} /Gy /sdl- /Oy- /fp:fast -) - -if(NBL_SANITIZE_ADDRESS) - list(APPEND NBL_C_COMPILE_OPTIONS /fsanitize=address) -endif() - -# this should also be not part of profile, pasting from old flags-set function temporary -# TODO: use profile - -#reason for INCREMENTAL:NO: https://docs.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=vs-2019 /LTCG is not valid for use with /INCREMENTAL. 
-set(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO} /INCREMENTAL:NO /LTCG:incremental") \ No newline at end of file +set(LANG C) +include("${CMAKE_CURRENT_LIST_DIR}/impl/MSVC.cmake") +# append unique C options here \ No newline at end of file diff --git a/cmake/adjust/template/vendor/impl/Clang.cmake b/cmake/adjust/template/vendor/impl/Clang.cmake new file mode 100644 index 0000000000..868309f828 --- /dev/null +++ b/cmake/adjust/template/vendor/impl/Clang.cmake @@ -0,0 +1,75 @@ +include("${CMAKE_CURRENT_LIST_DIR}/reset.cmake") + +# vendor template with options fitting for both C and CXX LANGs + +if(NOT DEFINED LANG) + message(FATAL_ERROR "LANG must be defined!") +endif() + +if(NBL_REQUEST_SSE_4_2) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS + -msse4.2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang1-msse4.2 + ) +endif() + +if(NBL_REQUEST_SSE_AVX2) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS + -mavx2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mavx2 + ) +endif() + +list(APPEND NBL_${LANG}_COMPILE_OPTIONS + -Wextra # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-W-warning + -maes # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-maes + -mfpmath=sse # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mfpmath + + # TODO: Yas, eliminate all below + -fno-strict-aliasing + -Wno-sequence-point + -Wno-c++98-compat + -Wno-c++98-compat-pedantic + -Wno-padded + -Wno-unsafe-buffer-usage + -Wno-switch-enum + -Wno-error=ignored-attributes + -Wno-unused-parameter + -Wno-unused-but-set-parameter + -Wno-error=unused-function + -Wno-error=unused-variable + -Wno-error=unused-parameter + -Wno-error=ignored-attributes + -Wno-error=non-pod-varargs +) + +if(NBL_SANITIZE_ADDRESS) + list(APPEND NBL_${LANG}_COMPILE_OPTIONS -fsanitize=address) +endif() + +if(NBL_SANITIZE_THREAD) + list(APPEND 
NBL_${LANG}_COMPILE_OPTIONS -fsanitize=thread) +endif() + +set(NBL_${LANG}_DEBUG_COMPILE_OPTIONS + -g # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-g + -mincremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible + -fincremental-extensions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-fincremental-extensions + -Wall # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-W-warning + -fstack-protector-strong # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-fstack-protector-strong + -gline-tables-only # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-gline-tables-only + -fno-omit-frame-pointer # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-fomit-frame-pointer + -fno-inline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions +) + +set(NBL_${LANG}_RELEASE_COMPILE_OPTIONS + -O2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-O-arg + -finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions + -mno-incremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible +) + +set(NBL_${LANG}_RELWITHDEBINFO_COMPILE_OPTIONS + -g # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-g + -O1 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-O-arg + -finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions + -mno-incremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible + -fno-omit-frame-pointer # 
https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-fomit-frame-pointer +) \ No newline at end of file diff --git a/cmake/adjust/template/vendor/impl/MSVC.cmake b/cmake/adjust/template/vendor/impl/MSVC.cmake new file mode 100644 index 0000000000..5b73b9073e --- /dev/null +++ b/cmake/adjust/template/vendor/impl/MSVC.cmake @@ -0,0 +1,71 @@ +include("${CMAKE_CURRENT_LIST_DIR}/reset.cmake") + +# vendor template with options fitting for both C and CXX LANGs + +if(NOT DEFINED LANG) + message(FATAL_ERROR "LANG must be defined!") +endif() + +if(NBL_REQUEST_SSE_4_2) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS + /arch:SSE4.2 # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 + ) +endif() + +if(NBL_REQUEST_SSE_AVX2) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS + /arch:AVX2 # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 + ) +endif() + +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS + /Zc:preprocessor # https://learn.microsoft.com/en-us/cpp/build/reference/zc-preprocessor?view=msvc-170 +) + +list(APPEND NBL_${LANG}_COMPILE_OPTIONS + /Zc:__cplusplus # https://learn.microsoft.com/en-us/cpp/build/reference/zc-cplusplus?view=msvc-170 + /Zc:wchar_t # https://learn.microsoft.com/en-us/cpp/build/reference/zc-wchar-t-wchar-t-is-native-type?view=msvc-170 + /fp:fast # https://learn.microsoft.com/en-us/cpp/build/reference/fp-specify-floating-point-behavior?view=msvc-170 + /MP${_NBL_JOBS_AMOUNT_} # https://learn.microsoft.com/en-us/cpp/build/reference/mp-build-with-multiple-processes?view=msvc-170 +) + +if(NBL_SANITIZE_ADDRESS) + list(APPEND NBL_${LANG}_COMPILE_OPTIONS + /fsanitize=address # https://learn.microsoft.com/en-us/cpp/build/reference/fsanitize?view=msvc-170 + ) + + list(APPEND NBL_${LANG}_DEBUG_COMPILE_OPTIONS + /RTC1 # https://learn.microsoft.com/en-us/cpp/build/reference/rtc-run-time-error-checks?view=msvc-170 + ) +endif() + +list(APPEND 
NBL_${LANG}_DEBUG_COMPILE_OPTIONS + /Ob0 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 + /Od # https://learn.microsoft.com/en-us/cpp/build/reference/od-disable-debug?view=msvc-170 + /INCREMENTAL # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 + /Oy- # https://learn.microsoft.com/en-us/cpp/build/reference/oy-frame-pointer-omission?view=msvc-170 +) + +list(APPEND NBL_${LANG}_RELEASE_COMPILE_OPTIONS + /O2 # https://learn.microsoft.com/en-us/cpp/build/reference/o1-o2-minimize-size-maximize-speed?view=msvc-170 + /Ob2 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 + /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 + /DNDEBUG # https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/assert-macro-assert-wassert?view=msvc-170 + /GL # https://learn.microsoft.com/en-us/cpp/build/reference/gl-whole-program-optimization?view=msvc-170 + /Gy- # https://learn.microsoft.com/en-us/cpp/build/reference/gy-enable-function-level-linking?view=msvc-170 + /sdl- # https://learn.microsoft.com/en-us/cpp/build/reference/sdl-enable-additional-security-checks?view=msvc-170 + /GF # https://learn.microsoft.com/en-us/cpp/build/reference/gf-eliminate-duplicate-strings?view=msvc-170 + /GS- # https://learn.microsoft.com/en-us/cpp/build/reference/gs-buffer-security-check?view=msvc-170 +) + +list(APPEND NBL_${LANG}_RELWITHDEBINFO_COMPILE_OPTIONS + /O2 # https://learn.microsoft.com/en-us/cpp/build/reference/o1-o2-minimize-size-maximize-speed?view=msvc-170 + /Ob1 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 + /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 + /LTCG:incremental # 
https://learn.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=msvc-170 + /Oy- # https://learn.microsoft.com/en-us/cpp/build/reference/oy-frame-pointer-omission?view=msvc-170 + /DNDEBUG # https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/assert-macro-assert-wassert?view=msvc-170 + /GL # https://learn.microsoft.com/en-us/cpp/build/reference/gl-whole-program-optimization?view=msvc-170 + /Gy # https://learn.microsoft.com/en-us/cpp/build/reference/gy-enable-function-level-linking?view=msvc-170 + /sdl- # https://learn.microsoft.com/en-us/cpp/build/reference/sdl-enable-additional-security-checks?view=msvc-170 +) \ No newline at end of file diff --git a/cmake/adjust/template/vendor/impl/reset.cmake b/cmake/adjust/template/vendor/impl/reset.cmake new file mode 100644 index 0000000000..6eb95b6cfd --- /dev/null +++ b/cmake/adjust/template/vendor/impl/reset.cmake @@ -0,0 +1,8 @@ +# reset profile vars, for sanity + +foreach(LANG CXX C) + unset(NBL_${LANG}_COMPILE_OPTIONS) + unset(NBL_${LANG}_RELEASE_COMPILE_OPTIONS) + unset(NBL_${LANG}_RELWITHDEBINFO_COMPILE_OPTIONS) + unset(NBL_${LANG}_DEBUG_COMPILE_OPTIONS) +endforeach() \ No newline at end of file From 39bb3e1ba6d46710f8d6a4e98741737da6a2f02f Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Fri, 11 Apr 2025 15:40:30 +0200 Subject: [PATCH 010/346] update dxc submodule with fixed clang 19.1.1 build, upgrade & correct NBL_REQUEST_COMPILE_OPTION_SUPPORT, add required instruction set features for simdjson explicitly; now I hit GLI errors due to bad templates --- 3rdparty/dxc/dxc | 2 +- cmake/adjust/flags.cmake | 26 ++++++++++++++++--- cmake/adjust/template/vendor/impl/Clang.cmake | 14 ++++++++++ 3 files changed, 38 insertions(+), 4 deletions(-) diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc index b2e75826b7..49b89ae671 160000 --- a/3rdparty/dxc/dxc +++ b/3rdparty/dxc/dxc @@ -1 +1 @@ -Subproject commit b2e75826b70d85d03686dd8a755ef477b4fa3807 +Subproject commit 
49b89ae6712f74fba2352e099f024724bcc32673 diff --git a/cmake/adjust/flags.cmake b/cmake/adjust/flags.cmake index ead5a086e6..6982e0593d 100644 --- a/cmake/adjust/flags.cmake +++ b/cmake/adjust/flags.cmake @@ -15,9 +15,10 @@ define_property(TARGET PROPERTY NBL_CONFIGURATION_MAP # Usage: NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG CONFIG OPTIONS ) # LANG, CONFIG - optional, OPTIONS - required function(NBL_REQUEST_COMPILE_OPTION_SUPPORT) - cmake_parse_arguments(IMPL "" "" "LANG;CONFIG;OPTIONS" ${ARGN}) + cmake_parse_arguments(IMPL "" "REQUEST_VAR;REQUIRED" "LANG;CONFIG;OPTIONS" ${ARGN}) set(DEFAULT_COMPILERS c cxx) + set(REQUEST_ALL_OPTIONS_PRESENT True) if(NOT IMPL_LANG) list(APPEND IMPL_LANG ${DEFAULT_COMPILERS}) @@ -51,14 +52,33 @@ function(NBL_REQUEST_COMPILE_OPTION_SUPPORT) foreach(CONFIG ${IMPL_CONFIG}) # TODO: validate (${CONFIG} \in ${CMAKE_CONFIGURATION_TYPES}) string(TOUPPER "${CONFIG}" CONFIG_UPPER) - set(NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_COMPILE_OPTIONS "${NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_COMPILE_OPTIONS};${COMPILE_OPTION}" PARENT_SCOPE) + set(NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_COMPILE_OPTIONS "${NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_COMPILE_OPTIONS};${COMPILE_OPTION}") endforeach() else() - set(NBL_${COMPILER_UPPER}_COMPILE_OPTIONS "${NBL_${COMPILER_UPPER}_COMPILE_OPTIONS};${COMPILE_OPTION}" PARENT_SCOPE) + set(NBL_${COMPILER_UPPER}_COMPILE_OPTIONS "${NBL_${COMPILER_UPPER}_COMPILE_OPTIONS};${COMPILE_OPTION}") endif() + else() + if(IMPL_REQUIRED) + message(FATAL_ERROR "Terminating, NBL_REQUEST_COMPILE_OPTION_SUPPORT was invoked with REQUIRED qualifier!") + endif() + + set(REQUEST_ALL_OPTIONS_PRESENT False) endif() endforeach() + + if(IMPL_CONFIG) + foreach(CONFIG ${IMPL_CONFIG}) + string(TOUPPER "${CONFIG}" CONFIG_UPPER) + set(NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_COMPILE_OPTIONS ${NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_COMPILE_OPTIONS} PARENT_SCOPE) + endforeach() + else() + set(NBL_${COMPILER_UPPER}_COMPILE_OPTIONS 
${NBL_${COMPILER_UPPER}_COMPILE_OPTIONS} PARENT_SCOPE) + endif() endforeach() + + if(IMPL_REQUEST_VAR) + set(${IMPL_REQUEST_VAR} ${REQUEST_ALL_OPTIONS_PRESENT} PARENT_SCOPE) + endif() endfunction() option(NBL_REQUEST_SSE_4_2 "Request compilation with SSE 4.2 instruction set enabled for Nabla projects" ON) diff --git a/cmake/adjust/template/vendor/impl/Clang.cmake b/cmake/adjust/template/vendor/impl/Clang.cmake index 868309f828..63549974c6 100644 --- a/cmake/adjust/template/vendor/impl/Clang.cmake +++ b/cmake/adjust/template/vendor/impl/Clang.cmake @@ -18,6 +18,20 @@ if(NBL_REQUEST_SSE_AVX2) ) endif() +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS + # latest Clang(CL) 19.1.1 shipped with VS seems to require explicitly features to be listed (simdjson) + # TODO: Yas, use with REQUEST_VAR, if the request fail then do not promote simdjson to build with + # HASWELL implementation because those flags + avx2 compose subset it wants in this case + + # also instead of enabling single options maybe we could consider requesting an + # instruction implementation set instead, eg -march=haswel, though this approach + # could add a few more flags then we actually need while building - to rethink + + -mbmi # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mbmi + -mlzcnt # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mlzcnt + -mpclmul # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mpclmul +) + list(APPEND NBL_${LANG}_COMPILE_OPTIONS -Wextra # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-W-warning -maes # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-maes From cbb4db1c448e9e03972ad20bb23d880db4408361 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Fri, 11 Apr 2025 18:42:25 +0200 Subject: [PATCH 011/346] update GLI (use custom location for GLM + fix with templates) and GLM (to latest and our own fork not mine, this one was 6 years 
old) submodules --- .gitmodules | 6 +++--- 3rdparty/CMakeLists.txt | 9 ++------- 3rdparty/gli | 2 +- 3rdparty/glm | 2 +- src/nbl/CMakeLists.txt | 2 +- 5 files changed, 8 insertions(+), 13 deletions(-) diff --git a/.gitmodules b/.gitmodules index 8edc1cead9..caca5b69a1 100644 --- a/.gitmodules +++ b/.gitmodules @@ -27,9 +27,6 @@ path = 3rdparty/libexpat url = git@github.com:Devsh-Graphics-Programming/libexpat.git branch = master -[submodule "3rdparty/glm"] - path = 3rdparty/glm - url = git@github.com:AnastaZIuk/glm.git [submodule "3rdparty/freetype2"] path = 3rdparty/freetype2 url = git@github.com:Devsh-Graphics-Programming/freetype.git @@ -117,3 +114,6 @@ [submodule "docker/compiler-explorer"] path = docker/compiler-explorer url = git@github.com:Devsh-Graphics-Programming/Compiler-Explorer-Docker.git +[submodule "3rdparty/glm"] + path = 3rdparty/glm + url = git@github.com:Devsh-Graphics-Programming/glm.git diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt index d838f92127..0335baf7e5 100755 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -231,7 +231,7 @@ if(_NBL_COMPILE_WITH_OPEN_EXR_) endif() -#gli +# gli option(_NBL_COMPILE_WITH_GLI_ "Build with GLI library" ON) if(_NBL_COMPILE_WITH_GLI_) set(_OLD_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) @@ -240,6 +240,7 @@ if(_NBL_COMPILE_WITH_GLI_) set(BUILD_SHARED_LIBS OFF) set(BUILD_STATIC_LIBS OFF) set(BUILD_TESTING OFF) + set(GLI_GLM_LOCATION "${CMAKE_CURRENT_SOURCE_DIR}/glm") add_subdirectory(gli gli EXCLUDE_FROM_ALL) set(BUILD_SHARED_LIBS ${_OLD_BUILD_SHARED_LIBS}) set(BUILD_STATIC_LIBS ${_OLD_BUILD_STATIC_LIBS}) @@ -419,12 +420,6 @@ add_library(aesGladman OBJECT add_subdirectory(argparse argparse EXCLUDE_FROM_ALL) -option(GLM_TEST_ENABLE_SIMD_SSE4_2 "Enable SSE 4.2 optimizations" ON) -option(GLM_TEST_ENABLE "Build unit tests" OFF) -#add_subdirectory(glm EXCLUDE_FROM_ALL) -set(BUILD_SHARED_LIBS ${_OLD_BUILD_SHARED_LIBS}) -set(BUILD_STATIC_LIBS ${_OLD_BUILD_STATIC_LIBS}) - if 
(NBL_BUILD_MITSUBA_LOADER) option(BUILD_tools "EXPAT: build the xmlwf tool for expat library" OFF) option(BUILD_examples "EXPAT: build the examples for expat library" OFF) diff --git a/3rdparty/gli b/3rdparty/gli index 559cbe1ec3..c4e6446d3b 160000 --- a/3rdparty/gli +++ b/3rdparty/gli @@ -1 +1 @@ -Subproject commit 559cbe1ec38878e182507d331e0780fbae5baf15 +Subproject commit c4e6446d3b646538026fd5a95533daed952878d4 diff --git a/3rdparty/glm b/3rdparty/glm index d162eee1e6..2d4c4b4dd3 160000 --- a/3rdparty/glm +++ b/3rdparty/glm @@ -1 +1 @@ -Subproject commit d162eee1e6f7c317a09229fe6ceab8ec6ab9a4b4 +Subproject commit 2d4c4b4dd31fde06cfffad7915c2b3006402322f diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 26acb8de10..bde7182ebd 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -308,7 +308,7 @@ endif() set(COMMON_INCLUDE_DIRS ${THIRD_PARTY_SOURCE_DIR}/glm - ${THIRD_PARTY_SOURCE_DIR}/renderdoc # for renderdoc api header + ${THIRD_PARTY_SOURCE_DIR}/renderdoc # for renderdoc api header ${CMAKE_BINARY_DIR}/3rdparty/zlib #for dynamically generated zconf.h $ #for dynamically generated pnglibconf.h $ #for dynamically generated jconfig.h From c1cc48b0454b2f5f3d58e6be59fa4ce20fb86717 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Sun, 13 Apr 2025 12:39:20 +0200 Subject: [PATCH 012/346] explicitly set limits for Clang toolset, correct some of backend options which must be passed with proxy XClang arg (they were ignored before), use NBL_REQUEST_COMPILE_OPTION_SUPPORT for Clang profile hence enforce flags validation at configure time (TODO: do the same for MSVC). 
It still crashes at JIT loader's cpp with -1073741819 - windooze access violation, I need to attach diagnostic outputs for LLVM team --- cmake/adjust/flags.cmake | 8 ++- cmake/adjust/template/vendor/impl/Clang.cmake | 55 ++++++++++++------- 2 files changed, 40 insertions(+), 23 deletions(-) diff --git a/cmake/adjust/flags.cmake b/cmake/adjust/flags.cmake index 6982e0593d..1718ac0520 100644 --- a/cmake/adjust/flags.cmake +++ b/cmake/adjust/flags.cmake @@ -15,7 +15,7 @@ define_property(TARGET PROPERTY NBL_CONFIGURATION_MAP # Usage: NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG CONFIG OPTIONS ) # LANG, CONFIG - optional, OPTIONS - required function(NBL_REQUEST_COMPILE_OPTION_SUPPORT) - cmake_parse_arguments(IMPL "" "REQUEST_VAR;REQUIRED" "LANG;CONFIG;OPTIONS" ${ARGN}) + cmake_parse_arguments(IMPL "REQUIRED" "REQUEST_VAR" "LANG;CONFIG;OPTIONS" ${ARGN}) set(DEFAULT_COMPILERS c cxx) set(REQUEST_ALL_OPTIONS_PRESENT True) @@ -43,7 +43,9 @@ function(NBL_REQUEST_COMPILE_OPTION_SUPPORT) foreach(COMPILE_OPTION ${IMPL_OPTIONS}) string(REGEX REPLACE "[-=:;/.]" "_" FLAG_SIGNATURE "${COMPILE_OPTION}") - set(FLAG_VAR "NBL_${COMPILER_UPPER}_COMPILER_HAS_${FLAG_SIGNATURE}_FLAG") + + set(TEST_NAME "NBL_${COMPILER_UPPER}_COMPILER_HAS_${FLAG_SIGNATURE}_FLAG") + set(FLAG_VAR ${TEST_NAME}) VALIDATE_FLAG("${COMPILE_OPTION}" "${FLAG_VAR}") @@ -59,7 +61,7 @@ function(NBL_REQUEST_COMPILE_OPTION_SUPPORT) endif() else() if(IMPL_REQUIRED) - message(FATAL_ERROR "Terminating, NBL_REQUEST_COMPILE_OPTION_SUPPORT was invoked with REQUIRED qualifier!") + message(FATAL_ERROR "${TEST_NAME} (a.k.a \"${COMPILE_OPTION}\") failed because its marked as REQUIRED!") endif() set(REQUEST_ALL_OPTIONS_PRESENT False) diff --git a/cmake/adjust/template/vendor/impl/Clang.cmake b/cmake/adjust/template/vendor/impl/Clang.cmake index 63549974c6..62c1c2568b 100644 --- a/cmake/adjust/template/vendor/impl/Clang.cmake +++ b/cmake/adjust/template/vendor/impl/Clang.cmake @@ -9,15 +9,32 @@ endif() if(NBL_REQUEST_SSE_4_2) 
NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS -msse4.2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang1-msse4.2 - ) + REQUIRED) # TODO: (****) optional but then adjust 3rdparty options on fail endif() if(NBL_REQUEST_SSE_AVX2) NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS -mavx2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mavx2 - ) + REQUIRED) # TODO: (****) endif() +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS + -Xclang=-fconstexpr-backtrace-limit=696969 + -Xclang=-fconstexpr-depth=696969 + -Xclang=-fconstexpr-steps=696969 + -Xclang=-ftemplate-backtrace-limit=0 # no limit + -Xclang=-ftemplate-depth=696969 + -Xclang=-fmacro-backtrace-limit=0 # no limit + -Xclang=-fspell-checking-limit=0 # no limit + -Xclang=-fcaret-diagnostics-max-lines=0 # no limit + + # whenever clang frontend or backend crashes we put diagnostics into top build direcotry + # use it to make a repro and attach to an issue - it outputs preprocessed cpp files with + # sh script for compilation + -fcrash-diagnostics=compiler + "-fcrash-diagnostics-dir=${NBL_ROOT_PATH_BINARY}/.crash-report" +REQUIRED) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS # latest Clang(CL) 19.1.1 shipped with VS seems to require explicitly features to be listed (simdjson) # TODO: Yas, use with REQUEST_VAR, if the request fail then do not promote simdjson to build with @@ -27,12 +44,13 @@ NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS # instruction implementation set instead, eg -march=haswel, though this approach # could add a few more flags then we actually need while building - to rethink + ################ + # TODO: (****) -> -mbmi # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mbmi -mlzcnt # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mlzcnt -mpclmul # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mpclmul -) + 
################ <- -list(APPEND NBL_${LANG}_COMPILE_OPTIONS -Wextra # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-W-warning -maes # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-maes -mfpmath=sse # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mfpmath @@ -53,37 +71,34 @@ list(APPEND NBL_${LANG}_COMPILE_OPTIONS -Wno-error=unused-parameter -Wno-error=ignored-attributes -Wno-error=non-pod-varargs -) +REQUIRED) if(NBL_SANITIZE_ADDRESS) - list(APPEND NBL_${LANG}_COMPILE_OPTIONS -fsanitize=address) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS -fsanitize=address REQUIRED) endif() if(NBL_SANITIZE_THREAD) - list(APPEND NBL_${LANG}_COMPILE_OPTIONS -fsanitize=thread) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS -fsanitize=thread) endif() -set(NBL_${LANG}_DEBUG_COMPILE_OPTIONS +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG OPTIONS -g # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-g -mincremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible - -fincremental-extensions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-fincremental-extensions + -Xclang=-fincremental-extensions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-fincremental-extensions -Wall # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-W-warning - -fstack-protector-strong # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-fstack-protector-strong -gline-tables-only # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-gline-tables-only - -fno-omit-frame-pointer # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-fomit-frame-pointer - -fno-inline-functions # 
https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions -) + -Xclang=-fno-inline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions +REQUIRED) -set(NBL_${LANG}_RELEASE_COMPILE_OPTIONS +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELEASE OPTIONS -O2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-O-arg - -finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions + -Xclang=-finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions -mno-incremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible -) +REQUIRED) -set(NBL_${LANG}_RELWITHDEBINFO_COMPILE_OPTIONS +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELWITHDEBINFO OPTIONS -g # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-g -O1 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-O-arg - -finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions + -Xclang=-finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions -mno-incremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible - -fno-omit-frame-pointer # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-fomit-frame-pointer -) \ No newline at end of file +REQUIRED) \ No newline at end of file From 16088b980f69b9c13c973b98e28459a48a10abf2 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Mon, 14 Apr 2025 10:31:47 +0200 Subject: [PATCH 013/346] Reduce device_capabilities_traits_jit.h instructions & use std::ostringstream for generated line, make it build with Clang(CL) 19.1.1 --- 
src/nbl/device/DeviceGen.py | 4 ++-- src/nbl/device/gen.py | 2 +- src/nbl/video/CJITIncludeLoader.cpp | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/nbl/device/DeviceGen.py b/src/nbl/device/DeviceGen.py index 288732de9b..9ad485fc84 100644 --- a/src/nbl/device/DeviceGen.py +++ b/src/nbl/device/DeviceGen.py @@ -562,7 +562,7 @@ def buildTraitsHeader(**params): res.append(emptyline) if 'enable_jit' in params and params['enable_jit']: - res.append("std::string jit_traits = R\"===(") + res.append("std::ostringstream oss;") buildTraitsHeaderHelper( res, @@ -582,7 +582,7 @@ def buildTraitsHeader(**params): ) if 'enable_jit' in params and params['enable_jit']: - res.append(")===\";") + res.append("std::string jit_traits = oss.str();") return res diff --git a/src/nbl/device/gen.py b/src/nbl/device/gen.py index b910d1aa8f..253d529b3d 100644 --- a/src/nbl/device/gen.py +++ b/src/nbl/device/gen.py @@ -120,7 +120,7 @@ args.jit_traits_output_path, buildTraitsHeader, type="JIT Members", - template="NBL_CONSTEXPR_STATIC_INLINE {} {} = )===\" + std::string(\"({})\") + CJITIncludeLoader::to_string({}.{}) + R\"===(;", + template="oss << \"NBL_CONSTEXPR_STATIC_INLINE {} {} = ({})\" + CJITIncludeLoader::to_string({}.{});", limits_json=limits, features_json=features, format_params=["type", "name", "type", "json_type", "cpp_name"], diff --git a/src/nbl/video/CJITIncludeLoader.cpp b/src/nbl/video/CJITIncludeLoader.cpp index edab1c046a..a9f27e5afd 100644 --- a/src/nbl/video/CJITIncludeLoader.cpp +++ b/src/nbl/video/CJITIncludeLoader.cpp @@ -49,4 +49,4 @@ std::string CJITIncludeLoader::collectDeviceCaps(const SPhysicalDeviceLimits& li return start + jit_traits + end; } -} \ No newline at end of file +} From 8f454a98a6b037b6e06f715248d03a2c84de5af5 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Mon, 14 Apr 2025 14:47:34 +0200 Subject: [PATCH 014/346] update bzip2 submodule to latest *official* revision, adjust build system + apply workaround for CLang(CL) 19.1.1 due 
to error : use of undeclared label "errhandler"; for some reason if in single translation unit we have identical label names (goto) in separate function bodies we hit this error --- 3rdparty/CMakeLists.txt | 23 +++++++++++------------ 3rdparty/bzip2 | 2 +- src/nbl/CMakeLists.txt | 8 +++++++- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt index 0335baf7e5..ffbf8e4cbd 100755 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -247,6 +247,16 @@ if(_NBL_COMPILE_WITH_GLI_) set(BUILD_TESTING ${_OLD_BUILD_TESTING}) endif() +set(ENABLE_STATIC_LIB ON) +set(ENABLE_SHARED_LIB OFF) +set(ENABLE_EXAMPLES OFF) +set(ENABLE_DOCS OFF) +set(ENABLE_APP OFF) +set(ENABLE_LIB_ONLY ON) +set(ENABLE_TESTS OFF) +set(ENABLE_SUMMARY OFF) +add_subdirectory(bzip2 bzip2 EXCLUDE_FROM_ALL) + add_library(lzma OBJECT lzma/C/Alloc.c lzma/C/LzFind.c @@ -263,17 +273,6 @@ add_library(lz4 OBJECT lz4/lib/xxhash.c ) - -add_library(bzip2 OBJECT - bzip2/blocksort.c - bzip2/bzlib.c - bzip2/compress.c - bzip2/crctable.c - bzip2/decompress.c - bzip2/huffman.c - bzip2/randtable.c -) - add_library(spirv_cross OBJECT nbl_spirv_cross/spirv_cfg.cpp nbl_spirv_cross/spirv_cross.cpp @@ -460,7 +459,7 @@ set(NBL_3RDPARTY_TARGETS shaderc_util shaderc jpeg-static - bzip2 + bz2_static simdjson nlohmann_json glslang diff --git a/3rdparty/bzip2 b/3rdparty/bzip2 index c4a14bb87e..f4301b0eac 160000 --- a/3rdparty/bzip2 +++ b/3rdparty/bzip2 @@ -1 +1 @@ -Subproject commit c4a14bb87ee395fb2c69ef5dbb50762fe862517e +Subproject commit f4301b0eac69eb109c5419813102be6f82d2b73a diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index bde7182ebd..0f0e4867b5 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -324,7 +324,6 @@ set(NBL_LIBRARY_CREATION_SOURCES ${NABLA_SRCS_COMMON} ${NABLA_HEADERS} $ - $ $ $ $ @@ -391,6 +390,13 @@ if(_NBL_BUILD_DPL_) target_link_libraries(Nabla INTERFACE tbb tbbmalloc tbbmalloc_proxy) endif() +# bzip2 
+if(NBL_STATIC_BUILD) + target_link_libraries(Nabla INTERFACE bz2_static) +else() + target_link_libraries(Nabla PRIVATE bz2_static) +endif() + # boost target_include_directories(Nabla PUBLIC "${BOOST_PREPROCESSOR_INCLUDE}") From b4e722a4709af985b1f18ec1d3a35b90663bba46 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Mon, 14 Apr 2025 16:14:14 +0200 Subject: [PATCH 015/346] remove `-Xclang=-fincremental-extensions` which causes funny compile errors with goto statements (https://github.com/Devsh-Graphics-Programming/Nabla/commit/8f454a98a6b037b6e06f715248d03a2c84de5af5) --- cmake/adjust/template/vendor/impl/Clang.cmake | 1 - 1 file changed, 1 deletion(-) diff --git a/cmake/adjust/template/vendor/impl/Clang.cmake b/cmake/adjust/template/vendor/impl/Clang.cmake index 62c1c2568b..1c3581d425 100644 --- a/cmake/adjust/template/vendor/impl/Clang.cmake +++ b/cmake/adjust/template/vendor/impl/Clang.cmake @@ -84,7 +84,6 @@ endif() NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG OPTIONS -g # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-g -mincremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible - -Xclang=-fincremental-extensions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-fincremental-extensions -Wall # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-W-warning -gline-tables-only # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-gline-tables-only -Xclang=-fno-inline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions From eda05ee269c7be25c8cadfc1c82d459a86e2f692 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Tue, 15 Apr 2025 19:54:26 +0200 Subject: [PATCH 016/346] adjust MSVC profile + correct incremental link options, update NBL_REQUEST_COMPILE_OPTION_SUPPORT & build system to correctly handle compile & link options, 
validate build options at configure time --- cmake/adjust/flags.cmake | 207 +++++++++++------- cmake/adjust/template/vendor/impl/Clang.cmake | 16 +- cmake/adjust/template/vendor/impl/MSVC.cmake | 37 ++-- cmake/adjust/template/vendor/impl/reset.cmake | 12 +- src/nbl/CMakeLists.txt | 1 + 5 files changed, 162 insertions(+), 111 deletions(-) diff --git a/cmake/adjust/flags.cmake b/cmake/adjust/flags.cmake index 1718ac0520..d8519aea07 100644 --- a/cmake/adjust/flags.cmake +++ b/cmake/adjust/flags.cmake @@ -12,10 +12,13 @@ define_property(TARGET PROPERTY NBL_CONFIGURATION_MAP BRIEF_DOCS "Stores configuration map for a target, it will evaluate to the configuration it's mapped to" ) -# Usage: NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG CONFIG OPTIONS ) -# LANG, CONFIG - optional, OPTIONS - required +# https://github.com/Kitware/CMake/blob/05e77b8a27033e6fd086456bd6cef28338ff1474/Modules/Internal/CheckCompilerFlag.cmake#L26C7-L26C42 +# must be cached because parse utility clears locals in the CheckCompilerFlag module +set(CHECK_COMPILER_FLAG_OUTPUT_VARIABLE NBL_COMPILER_FLAG_OUTPUT CACHE INTERNAL "") + +# Usage: NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG CONFIG COMPILE_OPTIONS LINK_OPTIONS ) function(NBL_REQUEST_COMPILE_OPTION_SUPPORT) - cmake_parse_arguments(IMPL "REQUIRED" "REQUEST_VAR" "LANG;CONFIG;OPTIONS" ${ARGN}) + cmake_parse_arguments(IMPL "REQUIRED" "REQUEST_VAR" "LANG;CONFIG;COMPILE_OPTIONS;LINK_OPTIONS" ${ARGN}) set(DEFAULT_COMPILERS c cxx) set(REQUEST_ALL_OPTIONS_PRESENT True) @@ -24,63 +27,39 @@ function(NBL_REQUEST_COMPILE_OPTION_SUPPORT) list(APPEND IMPL_LANG ${DEFAULT_COMPILERS}) endif() - if(NOT IMPL_OPTIONS) - message(FATAL_ERROR "NBL_REQUEST_COMPILE_OPTION_SUPPORT's OPTIONS empty!") - endif() - foreach(COMPILER IN ITEMS ${IMPL_LANG}) string(TOUPPER "${COMPILER}" COMPILER_UPPER) - if(COMPILER_UPPER STREQUAL C) - macro(VALIDATE_FLAG) - check_c_compiler_flag(${ARGV}) - endmacro() - elseif(COMPILER_UPPER STREQUAL CXX) - macro(VALIDATE_FLAG) - 
check_cxx_compiler_flag(${ARGV}) - endmacro() - endif() - - foreach(COMPILE_OPTION ${IMPL_OPTIONS}) - string(REGEX REPLACE "[-=:;/.]" "_" FLAG_SIGNATURE "${COMPILE_OPTION}") - - set(TEST_NAME "NBL_${COMPILER_UPPER}_COMPILER_HAS_${FLAG_SIGNATURE}_FLAG") - set(FLAG_VAR ${TEST_NAME}) + foreach(WHAT_OPTIONS IN ITEMS IMPL_COMPILE_OPTIONS IMPL_LINK_OPTIONS) + if(NOT ${WHAT_OPTIONS}) + continue() + endif() - VALIDATE_FLAG("${COMPILE_OPTION}" "${FLAG_VAR}") + set(IMPL_OPTIONS ${${WHAT_OPTIONS}}) + string(REPLACE IMPL_ "" WHAT_OPTIONS "${WHAT_OPTIONS}") - if(${FLAG_VAR}) + foreach(COMPILE_OPTION ${IMPL_OPTIONS}) if(IMPL_CONFIG) foreach(CONFIG ${IMPL_CONFIG}) # TODO: validate (${CONFIG} \in ${CMAKE_CONFIGURATION_TYPES}) string(TOUPPER "${CONFIG}" CONFIG_UPPER) - set(NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_COMPILE_OPTIONS "${NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_COMPILE_OPTIONS};${COMPILE_OPTION}") + set(NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_${WHAT_OPTIONS} "${NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_${WHAT_OPTIONS}};${COMPILE_OPTION}") endforeach() else() - set(NBL_${COMPILER_UPPER}_COMPILE_OPTIONS "${NBL_${COMPILER_UPPER}_COMPILE_OPTIONS};${COMPILE_OPTION}") - endif() - else() - if(IMPL_REQUIRED) - message(FATAL_ERROR "${TEST_NAME} (a.k.a \"${COMPILE_OPTION}\") failed because its marked as REQUIRED!") + set(NBL_${COMPILER_UPPER}_${WHAT_OPTIONS} "${NBL_${COMPILER_UPPER}_${WHAT_OPTIONS}};${COMPILE_OPTION}") endif() + endforeach() - set(REQUEST_ALL_OPTIONS_PRESENT False) + if(IMPL_CONFIG) + foreach(CONFIG ${IMPL_CONFIG}) + string(TOUPPER "${CONFIG}" CONFIG_UPPER) + set(NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_${WHAT_OPTIONS} ${NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_${WHAT_OPTIONS}} PARENT_SCOPE) + endforeach() + else() + set(NBL_${COMPILER_UPPER}_${WHAT_OPTIONS} ${NBL_${COMPILER_UPPER}_${WHAT_OPTIONS}} PARENT_SCOPE) endif() endforeach() - - if(IMPL_CONFIG) - foreach(CONFIG ${IMPL_CONFIG}) - string(TOUPPER "${CONFIG}" CONFIG_UPPER) - 
set(NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_COMPILE_OPTIONS ${NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_COMPILE_OPTIONS} PARENT_SCOPE) - endforeach() - else() - set(NBL_${COMPILER_UPPER}_COMPILE_OPTIONS ${NBL_${COMPILER_UPPER}_COMPILE_OPTIONS} PARENT_SCOPE) - endif() endforeach() - - if(IMPL_REQUEST_VAR) - set(${IMPL_REQUEST_VAR} ${REQUEST_ALL_OPTIONS_PRESENT} PARENT_SCOPE) - endif() endfunction() option(NBL_REQUEST_SSE_4_2 "Request compilation with SSE 4.2 instruction set enabled for Nabla projects" ON) @@ -101,42 +80,104 @@ foreach(NBL_COMPILER_LANGUAGE IN ITEMS C CXX) continue() endif() - # a profile MUST define - # - "NBL_${NBL_COMPILER_LANGUAGE}_${CONFIGURATION}_COMPILE_OPTIONS" (configuration dependent) - # - "NBL_${NBL_COMPILER_LANGUAGE}_COMPILE_OPTIONS" (global) + # a profile MUST define - # a profile MUST NOT define - # - NBL_COMPILE_OPTIONS + # - "NBL_${NBL_COMPILER_LANGUAGE}_${CONFIGURATION}_${WHAT}_OPTIONS" (configuration dependent) + # - "NBL_${NBL_COMPILER_LANGUAGE}_${WHAT}_OPTIONS" (global) - set(NBL_COMPILE_OPTIONS_VAR_NAME NBL_${NBL_COMPILER_LANGUAGE}_COMPILE_OPTIONS) - set(NBL_COMPILE_OPTIONS_VAR_VALUE ${${NBL_COMPILE_OPTIONS_VAR_NAME}}) + # a profile MUST NOT define + # - NBL_${WHAT}_OPTIONS + + # note: + # - use NBL_REQUEST_COMPILE_OPTION_SUPPORT in profile to creates those vars + # - include reset utility in profiles to init vars with empty lists - if(NOT DEFINED ${NBL_COMPILE_OPTIONS_VAR_NAME}) - message(FATAL_ERROR "\"${NBL_PROFILE_PATH}\" did not define \"${NBL_COMPILE_OPTIONS_VAR_NAME}\"!") - endif() + # TODO: DEFINITIONS for WHAT to unify the API - # update map with configuration dependent compile options - foreach(CONFIGURATION IN ITEMS RELEASE RELWITHDEBINFO DEBUG) - set(NBL_CONFIGURATION_COMPILE_OPTIONS_VAR_NAME NBL_${NBL_COMPILER_LANGUAGE}_${CONFIGURATION}_COMPILE_OPTIONS) - set(NBL_CONFIGURATION_COMPILE_OPTIONS_VAR_VALUE ${${NBL_CONFIGURATION_COMPILE_OPTIONS_VAR_NAME}}) + foreach(WHAT COMPILE LINK) + set(NBL_OPTIONS_VAR_NAME 
NBL_${NBL_COMPILER_LANGUAGE}_${WHAT}_OPTIONS) + set(NBL_OPTIONS_VAR_VALUE ${${NBL_OPTIONS_VAR_NAME}}) - if(NOT DEFINED ${NBL_CONFIGURATION_COMPILE_OPTIONS_VAR_NAME}) - message(FATAL_ERROR "\"${NBL_PROFILE_PATH}\" did not define \"${NBL_CONFIGURATION_COMPILE_OPTIONS_VAR_NAME}\"!") - endif() + if(NOT DEFINED ${NBL_OPTIONS_VAR_NAME}) + message(FATAL_ERROR "\"${NBL_PROFILE_PATH}\" did not define \"${NBL_OPTIONS_VAR_NAME}\"!") + endif() - list(APPEND NBL_${CONFIGURATION}_COMPILE_OPTIONS - # note that "${NBL_CONFIGURATION_COMPILE_OPTIONS_VAR_VALUE}" MUST NOT contain ANY - # $<$> generator expression in order to support our configuration mapping features - $<$:${NBL_CONFIGURATION_COMPILE_OPTIONS_VAR_VALUE}> - ) + # update map with configuration dependent compile options + foreach(CONFIGURATION IN ITEMS RELEASE RELWITHDEBINFO DEBUG) + set(NBL_CONFIGURATION_OPTIONS_VAR_NAME NBL_${NBL_COMPILER_LANGUAGE}_${CONFIGURATION}_${WHAT}_OPTIONS) + set(NBL_CONFIGURATION_OPTIONS_VAR_VALUE ${${NBL_CONFIGURATION_OPTIONS_VAR_NAME}}) - set(NBL_${CONFIGURATION}_COMPILE_OPTIONS ${NBL_${CONFIGURATION}_COMPILE_OPTIONS}) - endforeach() + if(NOT DEFINED ${NBL_CONFIGURATION_OPTIONS_VAR_NAME}) + message(FATAL_ERROR "\"${NBL_PROFILE_PATH}\" did not define \"${NBL_CONFIGURATION_OPTIONS_VAR_NAME}\"!") + endif() + + set(NBL_${CONFIGURATION}_${WHAT}_OPTIONS ${NBL_${CONFIGURATION}_${WHAT}_OPTIONS} + # note that "${NBL_CONFIGURATION_OPTIONS_VAR_VALUE}" MUST NOT contain ANY + # $<$> generator expression in order to support our configuration mapping features + $<$<${WHAT}_LANGUAGE:${NBL_COMPILER_LANGUAGE}>:${NBL_CONFIGURATION_OPTIONS_VAR_VALUE}> + ) + endforeach() + + # update map with global compile options + set(NBL_${WHAT}_OPTIONS ${NBL_${WHAT}_OPTIONS} + $<$<${WHAT}_LANGUAGE:${NBL_COMPILER_LANGUAGE}>:${NBL_${NBL_COMPILER_LANGUAGE}_${WHAT}_OPTIONS}> + ) + endforeach() + + block() + # validate build with a vendor profile, any warning diagnostic = error + # if you hit error it means the profile generates 
diagnostics due to: + # - an option (compile or link) which doesn't exist (typo? check vendor docs) + # - a set of options which invalidates an option (eg. MSVC's /INCREMENTAL with /LTCG:incremental is invalid, however linker will emit a warning by default + do a fall-back) + # https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_FLAGS.html#variable:CMAKE_%3CLANG%3E_FLAGS + # https://cmake.org/cmake/help/latest/module/CheckCompilerFlag.html#command:check_compiler_flag + + set(CMAKE_${NBL_COMPILER_LANGUAGE}_FLAGS) + + foreach(CONFIGURATION IN ITEMS Release RelWithDebInfo Debug) + set(CMAKE_TRY_COMPILE_CONFIGURATION ${CONFIGURATION}) + string(TOUPPER "${CONFIGURATION}" CONFIGURATION) + + set(TEST_NAME "NBL_${NBL_COMPILER_LANGUAGE}_LANG_${CONFIGURATION}_BUILD_OPTIONS_SUPPORT") + set(CMAKE_${NBL_COMPILER_LANGUAGE}_FLAGS_${CONFIGURATION}) + + set(COMPILE_OPTIONS ${NBL_${NBL_COMPILER_LANGUAGE}_COMPILE_OPTIONS} ${NBL_${NBL_COMPILER_LANGUAGE}_${CONFIGURATION}_COMPILE_OPTIONS}) + set(LINK_OPTIONS ${NBL_${NBL_COMPILER_LANGUAGE}_${CONFIGURATION}_LINK_OPTIONS}) + set(COMBINED ${COMPILE_OPTIONS} ${LINK_OPTIONS}) + + set(NBL_OUTPUT_FILE "${CMAKE_BINARY_DIR}/.nbl/try-compile/${TEST_NAME}.output") # no hash in output diagnostic file, desired + + string(SHA1 OPTIONS_HASH "${COMBINED}") + string(APPEND TEST_NAME "_HASH_${OPTIONS_HASH}") + + set(FLAG_VAR ${TEST_NAME}) + set(CMAKE_REQUIRED_LINK_OPTIONS ${LINK_OPTIONS}) + string(REPLACE ";" " " CLI_COMPILE_OPTIONS "${COMPILE_OPTIONS}") + + if(NBL_COMPILER_LANGUAGE STREQUAL C) + check_c_compiler_flag("${CLI_COMPILE_OPTIONS}" "${FLAG_VAR}") + elseif(NBL_COMPILER_LANGUAGE STREQUAL CXX) + check_cxx_compiler_flag("${CLI_COMPILE_OPTIONS}" "${FLAG_VAR}") + endif() + + if(NOT ${FLAG_VAR}) + if(NOT "${NBL_COMPILER_FLAG_OUTPUT}" STREQUAL "") + file(WRITE "${NBL_OUTPUT_FILE}" "${NBL_COMPILER_FLAG_OUTPUT}") # lock into file, do not cache, must read from the file because of NBL_COMPILER_FLAG_OUTPUT availability (CMake module writes an output 
only once before a signature flag status is created) + endif() - # update map with global compile options - list(APPEND NBL_COMPILE_OPTIONS $<$:${NBL_${NBL_COMPILER_LANGUAGE}_COMPILE_OPTIONS}>) + if(EXISTS "${NBL_OUTPUT_FILE}") + file(READ "${NBL_OUTPUT_FILE}" NBL_DIAGNOSTICS) + set(NBL_DIAGNOSTICS "Diagnostics:\n${NBL_DIAGNOSTICS}") + else() + set(NBL_DIAGNOSTICS) + endif() - set(NBL_COMPILE_OPTIONS ${NBL_COMPILE_OPTIONS}) + if(NOT DEFINED NBL_SKIP_BUILD_OPTIONS_VALIDATION) + message(FATAL_ERROR "${TEST_NAME} failed! To skip the validation define \"NBL_SKIP_BUILD_OPTIONS_VALIDATION\". ${NBL_DIAGNOSTICS}") + endif() + endif() + endforeach() + endblock() endforeach() function(NBL_EXT_P_APPEND_COMPILE_OPTIONS NBL_LIST_NAME MAP_RELEASE MAP_RELWITHDEBINFO MAP_DEBUG) @@ -240,23 +281,27 @@ function(nbl_adjust_flags) # global compile options list(APPEND _D_NBL_COMPILE_OPTIONS_ ${NBL_COMPILE_OPTIONS}) - - # per configuration compile options with mapping - list(APPEND _D_NBL_COMPILE_OPTIONS_ $<$:${NBL_${NBL_MAP_DEBUG_ITEM_U}_COMPILE_OPTIONS}>) - list(APPEND _D_NBL_COMPILE_OPTIONS_ $<$:${NBL_${NBL_MAP_RELEASE_ITEM_U}_COMPILE_OPTIONS}>) - list(APPEND _D_NBL_COMPILE_OPTIONS_ $<$:${NBL_${NBL_MAP_RELWITHDEBINFO_ITEM_U}_COMPILE_OPTIONS}>) - - # configuration mapping properties - string(APPEND _D_NBL_CONFIGURATION_MAP_ $<$:${NBL_MAP_DEBUG_ITEM_U}>) - string(APPEND _D_NBL_CONFIGURATION_MAP_ $<$:${NBL_MAP_RELEASE_ITEM_U}>) - string(APPEND _D_NBL_CONFIGURATION_MAP_ $<$:${NBL_MAP_RELWITHDEBINFO_ITEM_U}>) + + foreach(CONFIG ${CMAKE_CONFIGURATION_TYPES}) + string(TOUPPER "${CONFIG}" CONFIG_U) + + # per configuration options with mapping + foreach(WHAT COMPILE LINK) + list(APPEND _D_NBL_${WHAT}_OPTIONS_ $<$:${NBL_${NBL_MAP_${CONFIG_U}_ITEM_U}_${WHAT}_OPTIONS}>) + endforeach() + + # configuration mapping properties + string(APPEND _D_NBL_CONFIGURATION_MAP_ $<$:${NBL_MAP_${CONFIG_U}_ITEM_U}>) + endforeach() set_target_properties(${NBL_TARGET_ITEM} PROPERTIES NBL_CONFIGURATION_MAP 
"${_D_NBL_CONFIGURATION_MAP_}" COMPILE_OPTIONS "${_D_NBL_COMPILE_OPTIONS_}" + LINK_OPTIONS "${_D_NBL_LINK_OPTIONS_}" ) unset(_D_NBL_CONFIGURATION_MAP_) unset(_D_NBL_COMPILE_OPTIONS_) + unset(_D_NBL_LINK_OPTIONS_) set(MAPPED_CONFIG $>) diff --git a/cmake/adjust/template/vendor/impl/Clang.cmake b/cmake/adjust/template/vendor/impl/Clang.cmake index 1c3581d425..9f9f432e98 100644 --- a/cmake/adjust/template/vendor/impl/Clang.cmake +++ b/cmake/adjust/template/vendor/impl/Clang.cmake @@ -9,13 +9,13 @@ endif() if(NBL_REQUEST_SSE_4_2) NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS -msse4.2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang1-msse4.2 - REQUIRED) # TODO: (****) optional but then adjust 3rdparty options on fail +) # TODO: (****) optional but then adjust 3rdparty options on fail endif() if(NBL_REQUEST_SSE_AVX2) NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS -mavx2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mavx2 - REQUIRED) # TODO: (****) +) # TODO: (****) endif() NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS @@ -33,7 +33,7 @@ NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS # sh script for compilation -fcrash-diagnostics=compiler "-fcrash-diagnostics-dir=${NBL_ROOT_PATH_BINARY}/.crash-report" -REQUIRED) +) NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS # latest Clang(CL) 19.1.1 shipped with VS seems to require explicitly features to be listed (simdjson) @@ -71,10 +71,10 @@ NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS -Wno-error=unused-parameter -Wno-error=ignored-attributes -Wno-error=non-pod-varargs -REQUIRED) +) if(NBL_SANITIZE_ADDRESS) - NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS -fsanitize=address REQUIRED) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS -fsanitize=address) endif() if(NBL_SANITIZE_THREAD) @@ -87,17 +87,17 @@ NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG OPTIONS -Wall # 
https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-W-warning -gline-tables-only # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-gline-tables-only -Xclang=-fno-inline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions -REQUIRED) +) NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELEASE OPTIONS -O2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-O-arg -Xclang=-finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions -mno-incremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible -REQUIRED) +) NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELWITHDEBINFO OPTIONS -g # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-g -O1 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-O-arg -Xclang=-finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions -mno-incremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible -REQUIRED) \ No newline at end of file +) \ No newline at end of file diff --git a/cmake/adjust/template/vendor/impl/MSVC.cmake b/cmake/adjust/template/vendor/impl/MSVC.cmake index 5b73b9073e..62129690f9 100644 --- a/cmake/adjust/template/vendor/impl/MSVC.cmake +++ b/cmake/adjust/template/vendor/impl/MSVC.cmake @@ -7,22 +7,19 @@ if(NOT DEFINED LANG) endif() if(NBL_REQUEST_SSE_4_2) - NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS /arch:SSE4.2 # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 - ) +) # TODO: (****) optional but then adjust 3rdparty options on fail endif() if(NBL_REQUEST_SSE_AVX2) - 
NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS /arch:AVX2 # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 - ) +) # TODO: (****) optional but then adjust 3rdparty options on fail endif() -NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS /Zc:preprocessor # https://learn.microsoft.com/en-us/cpp/build/reference/zc-preprocessor?view=msvc-170 -) - -list(APPEND NBL_${LANG}_COMPILE_OPTIONS /Zc:__cplusplus # https://learn.microsoft.com/en-us/cpp/build/reference/zc-cplusplus?view=msvc-170 /Zc:wchar_t # https://learn.microsoft.com/en-us/cpp/build/reference/zc-wchar-t-wchar-t-is-native-type?view=msvc-170 /fp:fast # https://learn.microsoft.com/en-us/cpp/build/reference/fp-specify-floating-point-behavior?view=msvc-170 @@ -30,42 +27,48 @@ list(APPEND NBL_${LANG}_COMPILE_OPTIONS ) if(NBL_SANITIZE_ADDRESS) - list(APPEND NBL_${LANG}_COMPILE_OPTIONS + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS /fsanitize=address # https://learn.microsoft.com/en-us/cpp/build/reference/fsanitize?view=msvc-170 ) - list(APPEND NBL_${LANG}_DEBUG_COMPILE_OPTIONS + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG COMPILE_OPTIONS /RTC1 # https://learn.microsoft.com/en-us/cpp/build/reference/rtc-run-time-error-checks?view=msvc-170 ) endif() -list(APPEND NBL_${LANG}_DEBUG_COMPILE_OPTIONS +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG COMPILE_OPTIONS /Ob0 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 /Od # https://learn.microsoft.com/en-us/cpp/build/reference/od-disable-debug?view=msvc-170 - /INCREMENTAL # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 /Oy- # https://learn.microsoft.com/en-us/cpp/build/reference/oy-frame-pointer-omission?view=msvc-170 + + LINK_OPTIONS + /INCREMENTAL # 
https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 ) -list(APPEND NBL_${LANG}_RELEASE_COMPILE_OPTIONS +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELEASE COMPILE_OPTIONS /O2 # https://learn.microsoft.com/en-us/cpp/build/reference/o1-o2-minimize-size-maximize-speed?view=msvc-170 /Ob2 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 - /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 /DNDEBUG # https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/assert-macro-assert-wassert?view=msvc-170 /GL # https://learn.microsoft.com/en-us/cpp/build/reference/gl-whole-program-optimization?view=msvc-170 /Gy- # https://learn.microsoft.com/en-us/cpp/build/reference/gy-enable-function-level-linking?view=msvc-170 /sdl- # https://learn.microsoft.com/en-us/cpp/build/reference/sdl-enable-additional-security-checks?view=msvc-170 /GF # https://learn.microsoft.com/en-us/cpp/build/reference/gf-eliminate-duplicate-strings?view=msvc-170 /GS- # https://learn.microsoft.com/en-us/cpp/build/reference/gs-buffer-security-check?view=msvc-170 + + LINK_OPTIONS + /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 ) -list(APPEND NBL_${LANG}_RELWITHDEBINFO_COMPILE_OPTIONS +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELWITHDEBINFO COMPILE_OPTIONS /O2 # https://learn.microsoft.com/en-us/cpp/build/reference/o1-o2-minimize-size-maximize-speed?view=msvc-170 /Ob1 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 - /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 - /LTCG:incremental # https://learn.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=msvc-170 /Oy- # 
https://learn.microsoft.com/en-us/cpp/build/reference/oy-frame-pointer-omission?view=msvc-170 /DNDEBUG # https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/assert-macro-assert-wassert?view=msvc-170 /GL # https://learn.microsoft.com/en-us/cpp/build/reference/gl-whole-program-optimization?view=msvc-170 /Gy # https://learn.microsoft.com/en-us/cpp/build/reference/gy-enable-function-level-linking?view=msvc-170 /sdl- # https://learn.microsoft.com/en-us/cpp/build/reference/sdl-enable-additional-security-checks?view=msvc-170 + + LINK_OPTIONS + /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 (note: cannot use with /LTCG:incremental) + /LTCG:incremental # https://learn.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=msvc-170 ) \ No newline at end of file diff --git a/cmake/adjust/template/vendor/impl/reset.cmake b/cmake/adjust/template/vendor/impl/reset.cmake index 6eb95b6cfd..fc1230f326 100644 --- a/cmake/adjust/template/vendor/impl/reset.cmake +++ b/cmake/adjust/template/vendor/impl/reset.cmake @@ -1,8 +1,10 @@ -# reset profile vars, for sanity +# init profiles vars by resetting required lists foreach(LANG CXX C) - unset(NBL_${LANG}_COMPILE_OPTIONS) - unset(NBL_${LANG}_RELEASE_COMPILE_OPTIONS) - unset(NBL_${LANG}_RELWITHDEBINFO_COMPILE_OPTIONS) - unset(NBL_${LANG}_DEBUG_COMPILE_OPTIONS) + foreach(WHAT COMPILE LINK DEFINITIONS) + set(NBL_${LANG}_${WHAT}_OPTIONS "") + foreach(CONFIG RELEASE RELWITHDEBINFO DEBUG) + set(NBL_${LANG}_${CONFIG}_${WHAT}_OPTIONS "") + endforeach() + endforeach() endforeach() \ No newline at end of file diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 0f0e4867b5..98c7620159 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -396,6 +396,7 @@ if(NBL_STATIC_BUILD) else() target_link_libraries(Nabla PRIVATE bz2_static) endif() +add_dependencies(Nabla bz2_static) # boost target_include_directories(Nabla PUBLIC 
"${BOOST_PREPROCESSOR_INCLUDE}") From 25e0120e49206a8585da7d38d174bb153e52203f Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Wed, 16 Apr 2025 11:03:32 +0200 Subject: [PATCH 017/346] get rid of MSVC Release linker fallback with /LTCG due to /GL - manually specify the flag --- cmake/adjust/template/vendor/impl/MSVC.cmake | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cmake/adjust/template/vendor/impl/MSVC.cmake b/cmake/adjust/template/vendor/impl/MSVC.cmake index 62129690f9..b1b6b01a99 100644 --- a/cmake/adjust/template/vendor/impl/MSVC.cmake +++ b/cmake/adjust/template/vendor/impl/MSVC.cmake @@ -9,13 +9,13 @@ endif() if(NBL_REQUEST_SSE_4_2) NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS /arch:SSE4.2 # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 -) # TODO: (****) optional but then adjust 3rdparty options on fail +) # TODO: (****) should be (?) optional but then adjust 3rdparty options on fail endif() if(NBL_REQUEST_SSE_AVX2) NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS /arch:AVX2 # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 -) # TODO: (****) optional but then adjust 3rdparty options on fail +) # TODO: (****) should be (?) 
optional but then adjust 3rdparty options on fail endif() NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS @@ -57,6 +57,7 @@ NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELEASE COMPILE_OPTIONS LINK_OPTIONS /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 + /LTCG # https://learn.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=msvc-170 (note: /GL implies fallback with LTCG) ) NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELWITHDEBINFO COMPILE_OPTIONS @@ -69,6 +70,6 @@ NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELWITHDEBINFO COMPILE_OP /sdl- # https://learn.microsoft.com/en-us/cpp/build/reference/sdl-enable-additional-security-checks?view=msvc-170 LINK_OPTIONS - /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 (note: cannot use with /LTCG:incremental) - /LTCG:incremental # https://learn.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=msvc-170 + /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 (note: cannot use /INCREMENTAL with /LTCG:incremental, would cause fallback) + /LTCG:incremental # https://learn.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=msvc-170 ) \ No newline at end of file From b5d6795e293eba4e6c4e3cf658aa1d0178a03248 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Thu, 17 Apr 2025 17:05:09 +0200 Subject: [PATCH 018/346] Create vendor/template/frontend/MSVC.cmake, update profiles, respect CMAKE_<LANG>_COMPILER_FRONTEND_VARIANT, fix issues with /DELAYLOAD & debug info format for ClangCL, use MSVC-frontend checking logic, inherit default MSVC frontend options in Clang profile if using Windows' ClangCL; upgrade minimum CMake version to 3.31 --- CMakeLists.txt | 2 +- cmake/adjust/template/vendor/impl/Clang.cmake | 111
++++++++++-------- cmake/adjust/template/vendor/impl/MSVC.cmake | 69 +---------- .../template/vendor/impl/frontend/MSVC.cmake | 68 +++++++++++ cmake/common.cmake | 8 -- src/nbl/CMakeLists.txt | 7 ++ 6 files changed, 138 insertions(+), 127 deletions(-) create mode 100644 cmake/adjust/template/vendor/impl/frontend/MSVC.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 571743f5b0..3c5fa8da4a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ # This file is part of the "Nabla Engine". # For conditions of distribution and use, see copyright notice in nabla.h.in or nabla.h -cmake_minimum_required(VERSION 3.29) +cmake_minimum_required(VERSION 3.31) cmake_policy(SET CMP0112 NEW) cmake_policy(SET CMP0141 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0141.html#policy:CMP0141 cmake_policy(SET CMP0118 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0118.html#policy:CMP0118 diff --git a/cmake/adjust/template/vendor/impl/Clang.cmake b/cmake/adjust/template/vendor/impl/Clang.cmake index 9f9f432e98..4002bc4f65 100644 --- a/cmake/adjust/template/vendor/impl/Clang.cmake +++ b/cmake/adjust/template/vendor/impl/Clang.cmake @@ -6,19 +6,16 @@ if(NOT DEFINED LANG) message(FATAL_ERROR "LANG must be defined!") endif() -if(NBL_REQUEST_SSE_4_2) - NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS - -msse4.2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang1-msse4.2 -) # TODO: (****) optional but then adjust 3rdparty options on fail -endif() - -if(NBL_REQUEST_SSE_AVX2) - NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS - -mavx2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mavx2 -) # TODO: (****) +if(NBL_WITH_COMPILER_CRASH_DIAGNOSTICS) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS + # use it to make a repro and attach to an issue if you Clang crashes + # - it outputs preprocessed cpp files with sh script for compilation + -fcrash-diagnostics=compiler + 
-fcrash-diagnostics-dir=${NBL_ROOT_PATH_BINARY}/.crash-report + ) endif() -NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS -Xclang=-fconstexpr-backtrace-limit=696969 -Xclang=-fconstexpr-depth=696969 -Xclang=-fconstexpr-steps=696969 @@ -28,21 +25,10 @@ NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS -Xclang=-fspell-checking-limit=0 # no limit -Xclang=-fcaret-diagnostics-max-lines=0 # no limit - # whenever clang frontend or backend crashes we put diagnostics into top build direcotry - # use it to make a repro and attach to an issue - it outputs preprocessed cpp files with - # sh script for compilation - -fcrash-diagnostics=compiler - "-fcrash-diagnostics-dir=${NBL_ROOT_PATH_BINARY}/.crash-report" -) - -NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS # latest Clang(CL) 19.1.1 shipped with VS seems to require explicitly features to be listed (simdjson) - # TODO: Yas, use with REQUEST_VAR, if the request fail then do not promote simdjson to build with - # HASWELL implementation because those flags + avx2 compose subset it wants in this case - - # also instead of enabling single options maybe we could consider requesting an - # instruction implementation set instead, eg -march=haswel, though this approach - # could add a few more flags then we actually need while building - to rethink + # TODO: Yas, we should first do independent check if host has the flags, if the request fail then + # do not promote simdjson to build with HASWELL implementation because those flags + avx2 compose + # subset it wants in this case ################ # TODO: (****) -> @@ -73,31 +59,54 @@ NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS -Wno-error=non-pod-varargs ) -if(NBL_SANITIZE_ADDRESS) - NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS -fsanitize=address) -endif() - -if(NBL_SANITIZE_THREAD) - NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS -fsanitize=thread) 
+if(NBL_REQUEST_SSE_4_2) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS + -msse4.2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang1-msse4.2 +) # TODO: (****) optional but then adjust 3rdparty options on fail endif() -NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG OPTIONS - -g # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-g - -mincremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible - -Wall # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-W-warning - -gline-tables-only # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-gline-tables-only - -Xclang=-fno-inline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions -) - -NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELEASE OPTIONS - -O2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-O-arg - -Xclang=-finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions - -mno-incremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible -) - -NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELWITHDEBINFO OPTIONS - -g # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-g - -O1 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-O-arg - -Xclang=-finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions - -mno-incremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible -) \ No newline at end of file +if(CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES MSVC) + # ClangCL with MSVC frontend (most of the options are 
compatible but eg /arch:SSE4.2 seems to be not) + include("${CMAKE_CURRENT_LIST_DIR}/frontend/MSVC.cmake") + + # https://cmake.org/cmake/help/latest/variable/CMAKE_MSVC_DEBUG_INFORMATION_FORMAT.html + # should be set with CMAKE_MSVC_DEBUG_INFORMATION_FORMAT but for some reason it doesn't respect with ClangCL even though its MSVC frontend + # https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_FRONTEND_VARIANT.html#variable:CMAKE_%3CLANG%3E_COMPILER_FRONTEND_VARIANT + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG RELWITHDEBINFO COMPILE_OPTIONS /Zi) + return() +else() + if(NBL_REQUEST_SSE_AVX2) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS + -mavx2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mavx2 + ) # TODO: (****) + endif() + + if(NBL_SANITIZE_ADDRESS) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS -fsanitize=address) + endif() + + if(NBL_SANITIZE_THREAD) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS -fsanitize=thread) + endif() + + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG COMPILE_OPTIONS + -g # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-g + -mincremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible + -Wall # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-W-warning + -gline-tables-only # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-gline-tables-only + -Xclang=-fno-inline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions + ) + + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELEASE COMPILE_OPTIONS + -O2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-O-arg + -Xclang=-finline-functions # 
https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions + -mno-incremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible + ) + + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELWITHDEBINFO COMPILE_OPTIONS + -g # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-g + -O1 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-O-arg + -Xclang=-finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions + -mno-incremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible + ) +endif() \ No newline at end of file diff --git a/cmake/adjust/template/vendor/impl/MSVC.cmake b/cmake/adjust/template/vendor/impl/MSVC.cmake index b1b6b01a99..803adb1754 100644 --- a/cmake/adjust/template/vendor/impl/MSVC.cmake +++ b/cmake/adjust/template/vendor/impl/MSVC.cmake @@ -1,75 +1,10 @@ include("${CMAKE_CURRENT_LIST_DIR}/reset.cmake") +include("${CMAKE_CURRENT_LIST_DIR}/frontend/MSVC.cmake") # vendor template with options fitting for both C and CXX LANGs -if(NOT DEFINED LANG) - message(FATAL_ERROR "LANG must be defined!") -endif() - if(NBL_REQUEST_SSE_4_2) NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS /arch:SSE4.2 # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 ) # TODO: (****) should be (?) optional but then adjust 3rdparty options on fail -endif() - -if(NBL_REQUEST_SSE_AVX2) - NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS - /arch:AVX2 # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 -) # TODO: (****) should be (?) 
optional but then adjust 3rdparty options on fail -endif() - -NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS - /Zc:preprocessor # https://learn.microsoft.com/en-us/cpp/build/reference/zc-preprocessor?view=msvc-170 - /Zc:__cplusplus # https://learn.microsoft.com/en-us/cpp/build/reference/zc-cplusplus?view=msvc-170 - /Zc:wchar_t # https://learn.microsoft.com/en-us/cpp/build/reference/zc-wchar-t-wchar-t-is-native-type?view=msvc-170 - /fp:fast # https://learn.microsoft.com/en-us/cpp/build/reference/fp-specify-floating-point-behavior?view=msvc-170 - /MP${_NBL_JOBS_AMOUNT_} # https://learn.microsoft.com/en-us/cpp/build/reference/mp-build-with-multiple-processes?view=msvc-170 -) - -if(NBL_SANITIZE_ADDRESS) - NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS - /fsanitize=address # https://learn.microsoft.com/en-us/cpp/build/reference/fsanitize?view=msvc-170 - ) - - NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG COMPILE_OPTIONS - /RTC1 # https://learn.microsoft.com/en-us/cpp/build/reference/rtc-run-time-error-checks?view=msvc-170 - ) -endif() - -NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG COMPILE_OPTIONS - /Ob0 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 - /Od # https://learn.microsoft.com/en-us/cpp/build/reference/od-disable-debug?view=msvc-170 - /Oy- # https://learn.microsoft.com/en-us/cpp/build/reference/oy-frame-pointer-omission?view=msvc-170 - - LINK_OPTIONS - /INCREMENTAL # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 -) - -NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELEASE COMPILE_OPTIONS - /O2 # https://learn.microsoft.com/en-us/cpp/build/reference/o1-o2-minimize-size-maximize-speed?view=msvc-170 - /Ob2 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 - /DNDEBUG # 
https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/assert-macro-assert-wassert?view=msvc-170 - /GL # https://learn.microsoft.com/en-us/cpp/build/reference/gl-whole-program-optimization?view=msvc-170 - /Gy- # https://learn.microsoft.com/en-us/cpp/build/reference/gy-enable-function-level-linking?view=msvc-170 - /sdl- # https://learn.microsoft.com/en-us/cpp/build/reference/sdl-enable-additional-security-checks?view=msvc-170 - /GF # https://learn.microsoft.com/en-us/cpp/build/reference/gf-eliminate-duplicate-strings?view=msvc-170 - /GS- # https://learn.microsoft.com/en-us/cpp/build/reference/gs-buffer-security-check?view=msvc-170 - - LINK_OPTIONS - /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 - /LTCG # https://learn.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=msvc-170 (note: /GL implies fallback with LTCG) -) - -NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELWITHDEBINFO COMPILE_OPTIONS - /O2 # https://learn.microsoft.com/en-us/cpp/build/reference/o1-o2-minimize-size-maximize-speed?view=msvc-170 - /Ob1 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 - /Oy- # https://learn.microsoft.com/en-us/cpp/build/reference/oy-frame-pointer-omission?view=msvc-170 - /DNDEBUG # https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/assert-macro-assert-wassert?view=msvc-170 - /GL # https://learn.microsoft.com/en-us/cpp/build/reference/gl-whole-program-optimization?view=msvc-170 - /Gy # https://learn.microsoft.com/en-us/cpp/build/reference/gy-enable-function-level-linking?view=msvc-170 - /sdl- # https://learn.microsoft.com/en-us/cpp/build/reference/sdl-enable-additional-security-checks?view=msvc-170 - - LINK_OPTIONS - /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 (note: cannot use /INCREMENTAL with /LTCG:incremental, would cause 
fallback) - /LTCG:incremental # https://learn.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=msvc-170 -) \ No newline at end of file +endif() \ No newline at end of file diff --git a/cmake/adjust/template/vendor/impl/frontend/MSVC.cmake b/cmake/adjust/template/vendor/impl/frontend/MSVC.cmake new file mode 100644 index 0000000000..06ab606104 --- /dev/null +++ b/cmake/adjust/template/vendor/impl/frontend/MSVC.cmake @@ -0,0 +1,68 @@ +# https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_FRONTEND_VARIANT.html#variable:CMAKE_%3CLANG%3E_COMPILER_FRONTEND_VARIANT +# vendor frontend template with options fitting for both C and CXX LANGs + +if(NOT DEFINED LANG) + message(FATAL_ERROR "LANG must be defined!") +endif() + +if(NBL_REQUEST_SSE_AVX2) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS + /arch:AVX2 # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 +) # TODO: (****) should be (?) optional but then adjust 3rdparty options on fail +endif() + +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS + /Zc:preprocessor # https://learn.microsoft.com/en-us/cpp/build/reference/zc-preprocessor?view=msvc-170 + /Zc:__cplusplus # https://learn.microsoft.com/en-us/cpp/build/reference/zc-cplusplus?view=msvc-170 + /Zc:wchar_t # https://learn.microsoft.com/en-us/cpp/build/reference/zc-wchar-t-wchar-t-is-native-type?view=msvc-170 + /fp:fast # https://learn.microsoft.com/en-us/cpp/build/reference/fp-specify-floating-point-behavior?view=msvc-170 + /MP${_NBL_JOBS_AMOUNT_} # https://learn.microsoft.com/en-us/cpp/build/reference/mp-build-with-multiple-processes?view=msvc-170 +) + +if(NBL_SANITIZE_ADDRESS) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS + /fsanitize=address # https://learn.microsoft.com/en-us/cpp/build/reference/fsanitize?view=msvc-170 + ) + + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG COMPILE_OPTIONS + /RTC1 # 
https://learn.microsoft.com/en-us/cpp/build/reference/rtc-run-time-error-checks?view=msvc-170 + ) +endif() + +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG COMPILE_OPTIONS + /Ob0 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 + /Od # https://learn.microsoft.com/en-us/cpp/build/reference/od-disable-debug?view=msvc-170 + /Oy- # https://learn.microsoft.com/en-us/cpp/build/reference/oy-frame-pointer-omission?view=msvc-170 + + LINK_OPTIONS + /INCREMENTAL # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 +) + +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELEASE COMPILE_OPTIONS + /O2 # https://learn.microsoft.com/en-us/cpp/build/reference/o1-o2-minimize-size-maximize-speed?view=msvc-170 + /Ob2 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 + /DNDEBUG # https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/assert-macro-assert-wassert?view=msvc-170 + /GL # https://learn.microsoft.com/en-us/cpp/build/reference/gl-whole-program-optimization?view=msvc-170 + /Gy- # https://learn.microsoft.com/en-us/cpp/build/reference/gy-enable-function-level-linking?view=msvc-170 + /sdl- # https://learn.microsoft.com/en-us/cpp/build/reference/sdl-enable-additional-security-checks?view=msvc-170 + /GF # https://learn.microsoft.com/en-us/cpp/build/reference/gf-eliminate-duplicate-strings?view=msvc-170 + /GS- # https://learn.microsoft.com/en-us/cpp/build/reference/gs-buffer-security-check?view=msvc-170 + + LINK_OPTIONS + /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 + /LTCG # https://learn.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=msvc-170 (note: /GL implies fallback with LTCG) +) + +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELWITHDEBINFO COMPILE_OPTIONS + /O2 # 
https://learn.microsoft.com/en-us/cpp/build/reference/o1-o2-minimize-size-maximize-speed?view=msvc-170 + /Ob1 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 + /Oy- # https://learn.microsoft.com/en-us/cpp/build/reference/oy-frame-pointer-omission?view=msvc-170 + /DNDEBUG # https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/assert-macro-assert-wassert?view=msvc-170 + /GL # https://learn.microsoft.com/en-us/cpp/build/reference/gl-whole-program-optimization?view=msvc-170 + /Gy # https://learn.microsoft.com/en-us/cpp/build/reference/gy-enable-function-level-linking?view=msvc-170 + /sdl- # https://learn.microsoft.com/en-us/cpp/build/reference/sdl-enable-additional-security-checks?view=msvc-170 + + LINK_OPTIONS + /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 (note: cannot use /INCREMENTAL with /LTCG:incremental, would cause fallback) + /LTCG:incremental # https://learn.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=msvc-170 +) \ No newline at end of file diff --git a/cmake/common.cmake b/cmake/common.cmake index 69b915bbc7..69a0a5b980 100755 --- a/cmake/common.cmake +++ b/cmake/common.cmake @@ -69,14 +69,6 @@ macro(nbl_create_executable_project _EXTRA_SOURCES _EXTRA_OPTIONS _EXTRA_INCLUDE add_executable(${EXECUTABLE_NAME} ${NBL_EXECUTABLE_SOURCES}) nbl_handle_runtime_lib_properties(${EXECUTABLE_NAME}) - - if(WIN32 AND MSVC) - if(NBL_COMPILER_DYNAMIC_RUNTIME) - target_link_options(${EXECUTABLE_NAME} PUBLIC "/DELAYLOAD:$") - endif() - - target_link_options(${EXECUTABLE_NAME} PUBLIC "/DELAYLOAD:dxcompiler.dll") - endif() endif() nbl_handle_dll_definitions(${EXECUTABLE_NAME} PUBLIC) diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 98c7620159..ad5aa7c463 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -358,6 +358,13 @@ endif() target_compile_definitions(Nabla PRIVATE 
__NBL_BUILDING_NABLA__) +target_link_options(Nabla INTERFACE # proxy to downstream targets + $<$: + $<$:/DELAYLOAD:$> + /DELAYLOAD:dxcompiler.dll + > +) + if (ANDROID) add_library(android_native_app_glue STATIC ${ANDROID_NDK_ROOT_PATH}/sources/android/native_app_glue/android_native_app_glue.c From a1b9b99777496b62f439dd82d4e19a2c667a7d1d Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Fri, 18 Apr 2025 11:17:28 +0200 Subject: [PATCH 019/346] ah DXC needs to point to devshFixes_clang_19_1_1 --- 3rdparty/dxc/dxc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc index 5ab4d368b6..4621c707ed 160000 --- a/3rdparty/dxc/dxc +++ b/3rdparty/dxc/dxc @@ -1 +1 @@ -Subproject commit 5ab4d368b666d365217c751f5610b496b828ff96 +Subproject commit 4621c707ed774ab8382391f6434810ebecd37111 From 77ed416733f6e337445df4e27b1b62043da47eb7 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Fri, 18 Apr 2025 12:56:18 +0200 Subject: [PATCH 020/346] keep designated initializers for ISwapchain's SSharedCreationParams, use hlsl::ShaderStage in IDescriptorSetLayout.h --- include/nbl/asset/IDescriptorSetLayout.h | 3 ++- include/nbl/video/ISwapchain.h | 5 ++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/nbl/asset/IDescriptorSetLayout.h b/include/nbl/asset/IDescriptorSetLayout.h index ec3c182fdc..a50f267355 100644 --- a/include/nbl/asset/IDescriptorSetLayout.h +++ b/include/nbl/asset/IDescriptorSetLayout.h @@ -330,7 +330,8 @@ class IDescriptorSetLayout : public IDescriptorSetLayoutBase bindings[i].binding = i; bindings[i].type = type; bindings[i].createFlags = SBinding::E_CREATE_FLAGS::ECF_NONE; - bindings[i].stageFlags = stageAccessFlags ? stageAccessFlags[i]:asset::IShader::E_SHADER_STAGE::ESS_ALL_OR_LIBRARY; + + bindings[i].stageFlags = stageAccessFlags ? stageAccessFlags[i]:hlsl::ShaderStage::ESS_ALL_OR_LIBRARY; bindings[i].count = counts ? 
counts[i]:1u; bindings[i].samplers = nullptr; } diff --git a/include/nbl/video/ISwapchain.h b/include/nbl/video/ISwapchain.h index 99ba2e7975..882ac16648 100644 --- a/include/nbl/video/ISwapchain.h +++ b/include/nbl/video/ISwapchain.h @@ -21,8 +21,6 @@ class ISwapchain : public IBackendObject struct SSharedCreationParams { - SSharedCreationParams() {} - inline bool valid(const IPhysicalDevice* physDev, const ISurface* surface) const { ISurface::SCapabilities caps; @@ -456,12 +454,13 @@ class ISwapchain : public IBackendObject { return params.deduce(getOriginDevice()->getPhysicalDevice(),m_params.surface.get(),{&m_params.sharedParams.presentMode.value,1},{&m_params.sharedParams.compositeAlpha.value,1},{&m_params.sharedParams.preTransform.value,1}); } - inline core::smart_refctd_ptr recreate(SSharedCreationParams params={}) + inline core::smart_refctd_ptr recreate(SSharedCreationParams params) { if (!deduceRecreationParams(params)) return nullptr; return recreate_impl(std::move(params)); } + inline core::smart_refctd_ptr recreate() { return recreate({}); } // Vulkan: const VkSwapchainKHR* virtual const void* getNativeHandle() const = 0; From a2a7e72f42501fe71b6f4f141b92bac4d2b8cd93 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Fri, 18 Apr 2025 13:47:08 +0200 Subject: [PATCH 021/346] for my sanity - add -DNDEBUG to Clang profile (Unix) --- cmake/adjust/template/vendor/impl/Clang.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/adjust/template/vendor/impl/Clang.cmake b/cmake/adjust/template/vendor/impl/Clang.cmake index 4002bc4f65..a8ddfcb6bf 100644 --- a/cmake/adjust/template/vendor/impl/Clang.cmake +++ b/cmake/adjust/template/vendor/impl/Clang.cmake @@ -101,6 +101,7 @@ else() -O2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-O-arg -Xclang=-finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions -mno-incremental-linker-compatible # 
https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible + -DNDEBUG ) NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELWITHDEBINFO COMPILE_OPTIONS @@ -108,5 +109,6 @@ else() -O1 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-O-arg -Xclang=-finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions -mno-incremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible + -DNDEBUG ) endif() \ No newline at end of file From cde9e7971a890baf9b82fde753307d38cadf17fe Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Fri, 18 Apr 2025 17:24:28 +0200 Subject: [PATCH 022/346] correct CMAKE_MSVC_DEBUG_INFORMATION_FORMAT logic hence make it work without need to specify debug format flags with ClangCL by hand, enforce ProgramDatabase regardless the case (https://gitlab.kitware.com/cmake/cmake/-/issues/26879#note_1649970) --- CMakeLists.txt | 8 +------- cmake/adjust/flags.cmake | 13 +++---------- cmake/adjust/template/vendor/impl/Clang.cmake | 5 ----- 3 files changed, 4 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3c5fa8da4a..60c10acfab 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,13 +24,7 @@ option(NBL_STATIC_BUILD "" OFF) # ON for static builds, OFF for shared option(NBL_COMPILER_DYNAMIC_RUNTIME "" ON) option(NBL_SANITIZE_ADDRESS OFF) -if(CMAKE_CXX_COMPILER_ID STREQUAL MSVC) - if(NBL_SANITIZE_ADDRESS) - set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<$:ProgramDatabase>") - else() - set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<$:EditAndContinue>$<$:ProgramDatabase>") - endif() -endif() +set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT $<$:ProgramDatabase>) # ignored on non xMSVC-ABI targets if(NBL_STATIC_BUILD) message(STATUS "Static Nabla build enabled!") diff --git a/cmake/adjust/flags.cmake b/cmake/adjust/flags.cmake index 
d8519aea07..1e67914ae0 100644 --- a/cmake/adjust/flags.cmake +++ b/cmake/adjust/flags.cmake @@ -305,17 +305,10 @@ function(nbl_adjust_flags) set(MAPPED_CONFIG $>) - if(CMAKE_CXX_COMPILER_ID STREQUAL MSVC) - if(NBL_SANITIZE_ADDRESS) - set(NBL_TARGET_MSVC_DEBUG_INFORMATION_FORMAT "$<$,$>:ProgramDatabase>") - else() - set(NBL_TARGET_MSVC_DEBUG_INFORMATION_FORMAT "$<$:EditAndContinue>$<$:ProgramDatabase>") - endif() - endif() - set_target_properties(${NBL_TARGET_ITEM} PROPERTIES - MSVC_DEBUG_INFORMATION_FORMAT "${NBL_TARGET_MSVC_DEBUG_INFORMATION_FORMAT}" - ) + MSVC_DEBUG_INFORMATION_FORMAT $<$,$>:ProgramDatabase> # ignored on non xMSVC-ABI targets + ) + math(EXPR _NBL_ARG_I_ "${_NBL_ARG_I_} + 1") endwhile() else() # DIRECTORY mode diff --git a/cmake/adjust/template/vendor/impl/Clang.cmake b/cmake/adjust/template/vendor/impl/Clang.cmake index a8ddfcb6bf..0b00294411 100644 --- a/cmake/adjust/template/vendor/impl/Clang.cmake +++ b/cmake/adjust/template/vendor/impl/Clang.cmake @@ -68,11 +68,6 @@ endif() if(CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES MSVC) # ClangCL with MSVC frontend (most of the options are compatible but eg /arch:SSE4.2 seems to be not) include("${CMAKE_CURRENT_LIST_DIR}/frontend/MSVC.cmake") - - # https://cmake.org/cmake/help/latest/variable/CMAKE_MSVC_DEBUG_INFORMATION_FORMAT.html - # should be set with CMAKE_MSVC_DEBUG_INFORMATION_FORMAT but for some reason it doesn't respect with ClangCL even though its MSVC frontend - # https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_FRONTEND_VARIANT.html#variable:CMAKE_%3CLANG%3E_COMPILER_FRONTEND_VARIANT - NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG RELWITHDEBINFO COMPILE_OPTIONS /Zi) return() else() if(NBL_REQUEST_SSE_AVX2) From 6e4392e7e6e0e87a990fc7d65677e94c41ae9ef3 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Tue, 22 Apr 2025 10:35:39 +0200 Subject: [PATCH 023/346] add more NEW policies, leave TODO comment for `cmake_policy` --- CMakeLists.txt | 7 ++++++- 1 file changed, 
6 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 60c10acfab..a63d30a89d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,12 @@ # For conditions of distribution and use, see copyright notice in nabla.h.in or nabla.h cmake_minimum_required(VERSION 3.31) -cmake_policy(SET CMP0112 NEW) +# TODO: Yas - once we deploy 4.x we will fire `cmake_policy` instead of manually picking policies +# https://cmake.org/cmake/help/latest/command/cmake_minimum_required.html#policy-version +# also we should update deps which throw warnings about < 3.10 compatibility +cmake_policy(SET CMP0003 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0077.html#cmp0077 +cmake_policy(SET CMP0077 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0077.html#cmp0077 +cmake_policy(SET CMP0112 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0112.html#cmp0112 cmake_policy(SET CMP0141 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0141.html#policy:CMP0141 cmake_policy(SET CMP0118 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0118.html#policy:CMP0118 From dc41722b4ab0da664239f1157a9606a6c6ada868 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 28 Apr 2025 19:59:15 +0700 Subject: [PATCH 024/346] Implement mutable shader spec info --- include/nbl/asset/ICPUComputePipeline.h | 48 +++++++--- include/nbl/asset/ICPUGraphicsPipeline.h | 106 ++++++++++++++++------- include/nbl/asset/ICPUPipeline.h | 102 +--------------------- include/nbl/asset/IGraphicsPipeline.h | 4 +- include/nbl/asset/IPipeline.h | 26 ++++-- 5 files changed, 135 insertions(+), 151 deletions(-) diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index b9b707d9fc..704c4c05fc 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -12,20 +12,21 @@ namespace nbl::asset { //! 
CPU Version of Compute Pipeline -class ICPUComputePipeline : public ICPUPipeline,1> +class ICPUComputePipeline : public ICPUPipeline> { - using base_t = ICPUPipeline,1>; + using base_t = ICPUPipeline>; public: struct SCreationParams final : IPipeline::SCreationParams { - SShaderSpecInfo shader; + IPipelineBase::SShaderSpecInfo shader; }; static core::smart_refctd_ptr create(const SCreationParams& params) { if (!params.layout) return nullptr; auto retval = new ICPUComputePipeline(core::smart_refctd_ptr(params.layout)); + if (!retval->setSpecInfo(params.shader)) { retval->drop(); @@ -34,35 +35,54 @@ class ICPUComputePipeline : public ICPUPipeline,1> return core::smart_refctd_ptr(retval,core::dont_grab); } + inline core::smart_refctd_ptr clone(uint32_t _depth = ~0u) const override final + { + core::smart_refctd_ptr layout; + if (_depth>0u && m_layout) + layout = core::smart_refctd_ptr_static_cast(m_layout->clone(_depth-1u)); + + auto cp = new ICPUComputePipeline(std::move(layout)); + if (m_specInfo.shader) + { + SShaderSpecInfo specInfo = m_specInfo; + if (_depth > 0u) + { + specInfo.shader = core::smart_refctd_ptr_static_cast(m_specInfo.shader->clone(_depth - 1u)); + } + cp->setSpecInfo(specInfo); + } + return core::smart_refctd_ptr(cp,core::dont_grab); + } + constexpr static inline auto AssetType = ET_COMPUTE_PIPELINE; inline E_TYPE getAssetType() const override { return AssetType; } //! 
inline size_t getDependantCount() const override {return 2;} - // provide default arg - inline IPipelineBase::SShaderSpecInfo getSpecInfo() const {return base_t::getSpecInfo(hlsl::ShaderStage::ESS_COMPUTE);} - protected: using base_t::base_t; virtual ~ICPUComputePipeline() = default; - base_t* clone_impl(core::smart_refctd_ptr&& layout) const override - { - return new ICPUComputePipeline(std::move(layout)); - } - inline IAsset* getDependant_impl(const size_t ix) override { if (ix!=0) - return m_stages[0].shader.get(); + return m_specInfo.shader.get(); return const_cast(m_layout.get()); } - inline int8_t stageToIndex(const hlsl::ShaderStage stage) const override + inline bool setSpecInfo(const IPipelineBase::SShaderSpecInfo& info) { - return stage!=hlsl::ShaderStage::ESS_COMPUTE ? (-1):0; + const auto specSize = info.valid(); + if (specSize < 0) return false; + if (info.stage != hlsl::ESS_COMPUTE) return false; + m_specInfo = info; + return true; } + + private: + SShaderSpecInfo m_specInfo; + }; } diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index 2643db7550..b624d53aa9 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -13,10 +13,10 @@ namespace nbl::asset { -class ICPUGraphicsPipeline final : public ICPUPipeline,5u> +class ICPUGraphicsPipeline final : public ICPUPipeline, ICPUPipelineLayout,ICPURenderpass>> { - using pipeline_base_t = IGraphicsPipeline; - using base_t = ICPUPipeline; + using pipeline_base_t = IGraphicsPipeline,ICPUPipelineLayout, ICPURenderpass>; + using base_t = ICPUPipeline; public: struct SCreationParams final : pipeline_base_t::SCreationParams @@ -29,27 +29,65 @@ class ICPUGraphicsPipeline final : public ICPUPipeline create(const SCreationParams& params) { // we'll validate the specialization info later when attempting to set it - if (!params.impl_valid([](const IPipelineBase::SShaderSpecInfo& info)->bool{return true;})) - return nullptr; - 
auto retval = new ICPUGraphicsPipeline(params); - for (const auto spec : params.shaders) - if (spec.shader) - retval->setSpecInfo(spec); - return core::smart_refctd_ptr(retval,core::dont_grab); + if (!params.impl_valid([](const SShaderSpecInfo& info)->bool{return true;})) + return nullptr; + auto retval = new ICPUGraphicsPipeline(params); + for (const auto spec : params.shaders) + { + if (spec.shader) retval->setSpecInfo(spec); + } + return core::smart_refctd_ptr(retval,core::dont_grab); } + inline core::smart_refctd_ptr clone(uint32_t _depth = ~0u) const override final + { + core::smart_refctd_ptr layout; + if (_depth>0u && m_layout) + layout = core::smart_refctd_ptr_static_cast(m_layout->clone(_depth-1u)); + + auto* cp = [&] { + std::array, GRAPHICS_SHADER_STAGE_COUNT> _shaders; + for (auto i = 0; i < GRAPHICS_SHADER_STAGE_COUNT; i++) + _shaders[i] = m_specInfos[i]; + const SCreationParams params = { { + .shaders = _shaders, + .cached = m_params, + .renderpass = m_renderpass.get() + } }; + return new ICPUGraphicsPipeline(params); + }(); + for (auto specInfo : m_specInfos) + { + if (specInfo.shader) + { + auto newSpecInfo = specInfo; + if (_depth>0u) + { + newSpecInfo.shader = core::smart_refctd_ptr_static_cast(specInfo.shader->clone(_depth-1u)); + } + cp->setSpecInfo(newSpecInfo); + } + } + + return core::smart_refctd_ptr(cp,core::dont_grab); + } + + constexpr static inline auto AssetType = ET_GRAPHICS_PIPELINE; inline E_TYPE getAssetType() const override { return AssetType; } inline size_t getDependantCount() const override { auto stageCount = 2; // the layout and renderpass - for (const auto& stage : m_stages) - if (stage.shader) - stageCount++; + for (const auto& info : m_specInfos) + { + if (info.shader) + stageCount++; + } return stageCount; } @@ -65,18 +103,6 @@ class ICPUGraphicsPipeline final : public ICPUPipeline&& layout) const override - { - std::array _shaders; - for (auto i=0; i=GRAPHICS_SHADER_STAGE_COUNT || hlsl::bitCount(stage)!=1) + if (stageIx<0 
|| stageIx>= GRAPHICS_SHADER_STAGE_COUNT || hlsl::bitCount(stage)!=1) return -1; return stageIx; } + + inline bool setSpecInfo(const SShaderSpecInfo& info) + { + assert(isMutable()); + const auto specSize = info.valid(); + if (specSize<0) return false; + const auto stage = info.stage; + const auto stageIx = stageToIndex(stage); + if (stageIx<0) return false; + m_specInfos[stageIx] = info; + return true; + } + + SShaderSpecInfo m_specInfos[GRAPHICS_SHADER_STAGE_COUNT]; }; } diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index d1693f18eb..eb634d3f12 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -14,37 +14,13 @@ namespace nbl::asset { // Common Base class for pipelines -template +template class ICPUPipeline : public IAsset, public PipelineNonAssetBase { - using this_t = ICPUPipeline; + using this_t = ICPUPipeline; + using shader_info_spec_t = IPipelineBase::SShaderSpecInfo; public: - inline core::smart_refctd_ptr clone(uint32_t _depth = ~0u) const override final - { - core::smart_refctd_ptr layout; - if (_depth>0u && PipelineNonAssetBase::m_layout) - layout = core::smart_refctd_ptr_static_cast(PipelineNonAssetBase::m_layout->clone(_depth-1u)); - - auto cp = clone_impl(std::move(layout)); - for (auto i=0; i newShader; - if (_depth>0u) - { - newShader = core::smart_refctd_ptr_static_cast(shader->clone(_depth-1u)); - stageInfo.shader = newShader.get(); - } - cp->setSpecInfo(stageInfo); - } - } - - return core::smart_refctd_ptr(cp,core::dont_grab); - } // extras for this class ICPUPipelineLayout* getLayout() @@ -60,82 +36,10 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase PipelineNonAssetBase::m_layout = std::move(_layout); } - // The getters are weird because the shader pointer, spec constant map and entry point needs patching - inline IShader* getShader(const hlsl::ShaderStage stage) - { - assert(isMutable()); - return const_cast(getSpecInfo(stage).shader); - } - inline 
std::string* getEntryPoint(const hlsl::ShaderStage stage) - { - const auto stageIx = stageToIndex(stage); - if (stageIx<0) - return {}; - return &m_stages[stageIx].entryPoint; - } - inline IPipelineBase::SShaderSpecInfo::spec_constant_map_t* getSpecConstantMap(const hlsl::ShaderStage stage) - { - assert(isMutable()); - return const_cast(getSpecInfo(stage).entries); - } - // - inline IPipelineBase::SShaderSpecInfo getSpecInfo(const hlsl::ShaderStage stage) const - { - const auto stageIx = stageToIndex(stage); - if (stageIx<0) - return {}; - return m_stages[stageIx].info; - } - inline bool setSpecInfo(const IPipelineBase::SShaderSpecInfo& info) - { - assert(isMutable()); - const int64_t specSize = info.valid(); - if (specSize<0) - return false; - const auto stageIx = stageToIndex(info.stage); - if (stageIx<0) - return false; - auto& outStage = m_stages[stageIx]; - outStage.info = info; - outStage.entryPoint = info.entryPoint; - outStage.shader = core::smart_refctd_ptr(const_cast(info.shader)); - outStage.info.shader = outStage.shader.get(); - auto& outEntries = outStage.entries; - if (specSize>0) - { - outEntries = std::make_unique(); - outEntries->reserve(info.entries->size()); - std::copy(info.entries->begin(),info.entries->end(),std::insert_iterator(*outEntries,outEntries->begin())); - } - else - outEntries = nullptr; - outStage.info.entries = outEntries.get(); - return true; - } - inline bool clearStage(const hlsl::ShaderStage stage) - { - assert(isMutable()); - const auto stageIx = stageToIndex(stage); - if (stageIx<0) - return false; - m_stages[stageIx] = {}; - return true; - } - protected: using PipelineNonAssetBase::PipelineNonAssetBase; virtual ~ICPUPipeline() = default; - virtual this_t* clone_impl(core::smart_refctd_ptr&& layout) const = 0; - virtual int8_t stageToIndex(const hlsl::ShaderStage stage) const = 0; - - struct ShaderStage - { - std::string entryPoint = {}; - core::smart_refctd_ptr shader = {}; - std::unique_ptr entries = {}; - 
IPipelineBase::SShaderSpecInfo info = {}; - } m_stages[MaxShaderStageCount] = {}; }; } diff --git a/include/nbl/asset/IGraphicsPipeline.h b/include/nbl/asset/IGraphicsPipeline.h index c59ad51ca9..1f3bec79a1 100644 --- a/include/nbl/asset/IGraphicsPipeline.h +++ b/include/nbl/asset/IGraphicsPipeline.h @@ -81,7 +81,7 @@ class IGraphicsPipelineBase : public virtual core::IReferenceCounted }; }; -template +template class IGraphicsPipeline : public IPipeline, public IGraphicsPipelineBase { protected: @@ -91,7 +91,7 @@ class IGraphicsPipeline : public IPipeline, public IGraphics struct SCreationParams : IPipeline::SCreationParams { protected: - using SpecInfo = IPipelineBase::SShaderSpecInfo; + using SpecInfo = SpecInfoType; template inline bool impl_valid(ExtraLambda&& extra) const { diff --git a/include/nbl/asset/IPipeline.h b/include/nbl/asset/IPipeline.h index 036a684729..8ecb2f0fb3 100644 --- a/include/nbl/asset/IPipeline.h +++ b/include/nbl/asset/IPipeline.h @@ -132,8 +132,10 @@ class IPipelineBase Without Specialization Constants, you would have to commit to a final value before the SPIR-V compilation */ + template struct SShaderSpecInfo final { + //! Structure specifying a specialization map entry /* Note that if specialization constant ID is used @@ -146,7 +148,7 @@ class IPipelineBase */ //!< The ID of the specialization constant in SPIR-V. If it isn't used in the shader, the map entry does not affect the behavior of the pipeline. using spec_constant_id_t = uint32_t; - struct SSpecConstantValue + struct SSpecConstantValueImmutable { const void* data = nullptr; //!< The byte size of the specialization constant value within the supplied data buffer. 
@@ -154,8 +156,18 @@ class IPipelineBase inline operator bool() const {return data&&size;} - auto operator<=>(const SSpecConstantValue&) const = default; + auto operator<=>(const SSpecConstantValueImmutable&) const = default; + }; + + struct SSPecConstantValueMutable + { + core::vector data; + inline operator bool() const { return data.size(); } + auto operator<=>(const SSPecConstantValueMutable&) const = default; }; + + using SSpecConstantValue = std::conditional_t; + inline SSpecConstantValue getSpecializationByteValue(const spec_constant_id_t _specConstID) const { if (!entries) @@ -231,11 +243,14 @@ class IPipelineBase return static_cast(specData); } + using shader_ptr_t = std::conditional_t, const IShader*>; + using entry_point_t = std::conditional_t; using spec_constant_map_t = core::unordered_map; + using entries_t = std::conditional_t; - const IShader* shader = nullptr; + shader_ptr_t shader = nullptr; // A name of the function where the entry point of an shader executable begins. It's often "main" function. 
- std::string_view entryPoint = {}; + entry_point_t entryPoint = {}; // stage must be set hlsl::ShaderStage stage = hlsl::ShaderStage::ESS_UNKNOWN; // there's some padding here @@ -244,12 +259,13 @@ class IPipelineBase uint8_t requireFullSubgroups : 1 = false; // Container choice implicitly satisfies: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-constantID-04911 - const spec_constant_map_t* entries = nullptr; + entries_t entries = nullptr; // By requiring Nabla Core Profile features we implicitly satisfy: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02784 // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02785 // Also because our API is sane, it satisfies the following by construction: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02754 + }; }; template From 7fe3431366e436dc63fa1795afd324ba99cf473d Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 28 Apr 2025 20:14:37 +0700 Subject: [PATCH 025/346] Rework IGPUGraphicsPipeline --- include/nbl/asset/IGraphicsPipeline.h | 6 +++--- include/nbl/video/IGPUGraphicsPipeline.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/nbl/asset/IGraphicsPipeline.h b/include/nbl/asset/IGraphicsPipeline.h index 1f3bec79a1..77f220aa78 100644 --- a/include/nbl/asset/IGraphicsPipeline.h +++ b/include/nbl/asset/IGraphicsPipeline.h @@ -86,12 +86,12 @@ class IGraphicsPipeline : public IPipeline, public IGraphics { protected: using renderpass_t = RenderpassType; + using spec_info_t = SpecInfoType; public: struct SCreationParams : IPipeline::SCreationParams { protected: - using SpecInfo = SpecInfoType; template inline bool 
impl_valid(ExtraLambda&& extra) const { @@ -136,7 +136,7 @@ class IGraphicsPipeline : public IPipeline, public IGraphics public: inline bool valid() const { - return impl_valid([](const SpecInfo& info)->bool + return impl_valid([](const spec_info_t& info)->bool { if (!info.valid()) return false; @@ -144,7 +144,7 @@ class IGraphicsPipeline : public IPipeline, public IGraphics }); } - std::span shaders = {}; + std::span shaders = {}; SCachedCreationParams cached = {}; renderpass_t* renderpass = nullptr; }; diff --git a/include/nbl/video/IGPUGraphicsPipeline.h b/include/nbl/video/IGPUGraphicsPipeline.h index 8240bcea94..4838d7f4d3 100644 --- a/include/nbl/video/IGPUGraphicsPipeline.h +++ b/include/nbl/video/IGPUGraphicsPipeline.h @@ -11,9 +11,9 @@ namespace nbl::video { -class IGPUGraphicsPipeline : public IBackendObject, public asset::IGraphicsPipeline +class IGPUGraphicsPipeline : public IBackendObject, public asset::IGraphicsPipeline, const IGPUPipelineLayout,const IGPURenderpass> { - using pipeline_t = asset::IGraphicsPipeline; + using pipeline_t = asset::IGraphicsPipeline, const IGPUPipelineLayout,const IGPURenderpass>; public: struct SCreationParams final : pipeline_t::SCreationParams, SPipelineCreationParams @@ -36,7 +36,7 @@ class IGPUGraphicsPipeline : public IBackendObject, public asset::IGraphicsPipel if (!layout) return {}; SSpecializationValidationResult retval = {.count=0,.dataSize=0}; - const bool valid = pipeline_t::SCreationParams::impl_valid([&retval](const IPipelineBase::SShaderSpecInfo& info)->bool + const bool valid = pipeline_t::SCreationParams::impl_valid([&retval](const spec_info_t& info)->bool { const auto dataSize = info.valid(); if (dataSize<0) From 8dbe9c782ef1f0ac7f154d9c04598f740c061d8f Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 28 Apr 2025 20:14:51 +0700 Subject: [PATCH 026/346] Rework IGPUComputePipeline.h --- include/nbl/video/IGPUComputePipeline.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/include/nbl/video/IGPUComputePipeline.h b/include/nbl/video/IGPUComputePipeline.h index 49e44dfcc1..ba29cc58e2 100644 --- a/include/nbl/video/IGPUComputePipeline.h +++ b/include/nbl/video/IGPUComputePipeline.h @@ -17,6 +17,7 @@ namespace nbl::video class IGPUComputePipeline : public IBackendObject, public asset::IPipeline { using pipeline_t = asset::IPipeline; + using spec_info_t = SShaderSpecInfo; public: struct SCreationParams final : pipeline_t::SCreationParams, SPipelineCreationParams @@ -63,11 +64,11 @@ class IGPUComputePipeline : public IBackendObject, public asset::IPipeline(dataSize)}; } - inline std::span getShaders() const {return {&shader,1}; } + inline std::span getShaders() const {return {&shader,1}; } // TODO: Could guess the required flags from SPIR-V introspection of declared caps core::bitflag flags = FLAGS::NONE; - IPipelineBase::SShaderSpecInfo shader = {}; + spec_info_t shader = {}; }; inline core::bitflag getCreationFlags() const {return m_flags;} From 436e6e16e51f1abf19c2c599834ed9a7fa1f0d38 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 29 Apr 2025 15:09:19 +0700 Subject: [PATCH 027/346] Remove default value for mutable template parameter --- include/nbl/asset/IPipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/asset/IPipeline.h b/include/nbl/asset/IPipeline.h index 8ecb2f0fb3..97d7ab9c94 100644 --- a/include/nbl/asset/IPipeline.h +++ b/include/nbl/asset/IPipeline.h @@ -132,7 +132,7 @@ class IPipelineBase Without Specialization Constants, you would have to commit to a final value before the SPIR-V compilation */ - template + template struct SShaderSpecInfo final { From 2e367d12519c473135785b689700b7ce62457104 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 5 May 2025 17:00:39 +0700 Subject: [PATCH 028/346] Implement IGPUPipeline and refactor SCreationParams --- include/nbl/asset/ICPUComputePipeline.h | 67 ++-- include/nbl/asset/ICPUGraphicsPipeline.h | 183 +++++----- include/nbl/asset/ICPUPipeline.h 
| 111 ++++++- include/nbl/asset/IGraphicsPipeline.h | 74 +---- include/nbl/asset/IPipeline.h | 349 ++++++-------------- include/nbl/video/IGPUComputePipeline.h | 20 +- include/nbl/video/IGPUGraphicsPipeline.h | 68 +++- include/nbl/video/IGPUPipeline.h | 110 ++++++ include/nbl/video/IGPURayTracingPipeline.h | 32 +- include/nbl/video/SPipelineCreationParams.h | 2 +- 10 files changed, 518 insertions(+), 498 deletions(-) create mode 100644 include/nbl/video/IGPUPipeline.h diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 704c4c05fc..d9bc8dd646 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -17,68 +17,53 @@ class ICPUComputePipeline : public ICPUPipeline> using base_t = ICPUPipeline>; public: - struct SCreationParams final : IPipeline::SCreationParams - { - IPipelineBase::SShaderSpecInfo shader; - }; - static core::smart_refctd_ptr create(const SCreationParams& params) + explicit ICPUComputePipeline(const ICPUPipelineLayout* layout): + base_t(core::smart_refctd_ptr(layout)) + {} + + static core::smart_refctd_ptr create(const ICPUPipelineLayout* layout) { - if (!params.layout) - return nullptr; - auto retval = new ICPUComputePipeline(core::smart_refctd_ptr(params.layout)); - - if (!retval->setSpecInfo(params.shader)) - { - retval->drop(); - return nullptr; - } + auto retval = new ICPUComputePipeline(layout); return core::smart_refctd_ptr(retval,core::dont_grab); } - inline core::smart_refctd_ptr clone(uint32_t _depth = ~0u) const override final + inline base_t* clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final { - core::smart_refctd_ptr layout; - if (_depth>0u && m_layout) - layout = core::smart_refctd_ptr_static_cast(m_layout->clone(_depth-1u)); - - auto cp = new ICPUComputePipeline(std::move(layout)); - if (m_specInfo.shader) - { - SShaderSpecInfo specInfo = m_specInfo; - if (_depth > 0u) - { - specInfo.shader = 
core::smart_refctd_ptr_static_cast(m_specInfo.shader->clone(_depth - 1u)); - } - cp->setSpecInfo(specInfo); - } - return core::smart_refctd_ptr(cp,core::dont_grab); + auto newPipeline = new ICPUComputePipeline(std::move(layout)); + newPipeline->m_specInfo = newPipeline->cloneSpecInfo(m_specInfo, depth); + return newPipeline; } constexpr static inline auto AssetType = ET_COMPUTE_PIPELINE; inline E_TYPE getAssetType() const override { return AssetType; } - //! - inline size_t getDependantCount() const override {return 2;} + //! + inline size_t getDependantCount() const override { return 2; } + + inline virtual std::span getSpecInfo(hlsl::ShaderStage stage) override final + { + if (stage==hlsl::ShaderStage::ESS_COMPUTE && isMutable()) + return {m_specInfo,1}; + return {}; + } + + inline virtual bool valid() const override final + { + // TODO(kevinyu): Fix this temporary dummy code + return true; + } protected: using base_t::base_t; virtual ~ICPUComputePipeline() = default; - inline IAsset* getDependant_impl(const size_t ix) override + inline IAsset* getDependant_impl(const size_t ix) override { if (ix!=0) return m_specInfo.shader.get(); return const_cast(m_layout.get()); } - inline bool setSpecInfo(const IPipelineBase::SShaderSpecInfo& info) - { - const auto specSize = info.valid(); - if (specSize < 0) return false; - if (info.stage != hlsl::ESS_COMPUTE) return false; - m_specInfo = info; - return true; - } private: SShaderSpecInfo m_specInfo; diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index b624d53aa9..b93b8165aa 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -13,135 +13,102 @@ namespace nbl::asset { -class ICPUGraphicsPipeline final : public ICPUPipeline, ICPUPipelineLayout,ICPURenderpass>> +class ICPUGraphicsPipeline final : public ICPUPipeline> { - using pipeline_base_t = IGraphicsPipeline,ICPUPipelineLayout, ICPURenderpass>; + using pipeline_base_t = 
IGraphicsPipeline; using base_t = ICPUPipeline; public: - struct SCreationParams final : pipeline_base_t::SCreationParams - { - private: - friend class ICPUGraphicsPipeline; - template - inline bool impl_valid(ExtraLambda&& extra) const - { - return pipeline_base_t::SCreationParams::impl_valid(std::move(extra)); - } - }; - - static core::smart_refctd_ptr create(const SCreationParams& params) - { - // we'll validate the specialization info later when attempting to set it - if (!params.impl_valid([](const SShaderSpecInfo& info)->bool{return true;})) - return nullptr; - auto retval = new ICPUGraphicsPipeline(params); - for (const auto spec : params.shaders) + explicit ICPUGraphicsPipeline(const ICPUPipelineLayout* layout) + : base_t(layout, {}, {}) + {} + + static core::smart_refctd_ptr create(const ICPUPipelineLayout* layout) { - if (spec.shader) retval->setSpecInfo(spec); + auto retval = new ICPUGraphicsPipeline(layout); + return core::smart_refctd_ptr(retval,core::dont_grab); } - return core::smart_refctd_ptr(retval,core::dont_grab); - } - - inline core::smart_refctd_ptr clone(uint32_t _depth = ~0u) const override final - { - core::smart_refctd_ptr layout; - if (_depth>0u && m_layout) - layout = core::smart_refctd_ptr_static_cast(m_layout->clone(_depth-1u)); - auto* cp = [&] { - std::array, GRAPHICS_SHADER_STAGE_COUNT> _shaders; + inline base_t* clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final + { + auto* newPipeline = new ICPUGraphicsPipeline(layout.get()); for (auto i = 0; i < GRAPHICS_SHADER_STAGE_COUNT; i++) - _shaders[i] = m_specInfos[i]; - const SCreationParams params = { { - .shaders = _shaders, - .cached = m_params, - .renderpass = m_renderpass.get() - } }; - return new ICPUGraphicsPipeline(params); - }(); - for (auto specInfo : m_specInfos) - { - if (specInfo.shader) + newPipeline->m_specInfos[i] = m_specInfos[i]; + newPipeline->m_params = m_params; + newPipeline->m_renderpass = m_renderpass; + + for (auto specInfo_i = 0u; 
specInfo_i < m_specInfos.size(); specInfo_i++) { - auto newSpecInfo = specInfo; - if (_depth>0u) - { - newSpecInfo.shader = core::smart_refctd_ptr_static_cast(specInfo.shader->clone(_depth-1u)); - } - cp->setSpecInfo(newSpecInfo); + newPipeline->m_specInfos[specInfo_i] = newPipeline->cloneSpecInfo(m_specInfos[specInfo_i], depth); } - } - - return core::smart_refctd_ptr(cp,core::dont_grab); - } - - - constexpr static inline auto AssetType = ET_GRAPHICS_PIPELINE; - inline E_TYPE getAssetType() const override { return AssetType; } - - inline size_t getDependantCount() const override - { - auto stageCount = 2; // the layout and renderpass - for (const auto& info : m_specInfos) - { - if (info.shader) - stageCount++; - } - return stageCount; - } - - // extras for this class - inline const SCachedCreationParams& getCachedCreationParams() const {return base_t::getCachedCreationParams();} + + return newPipeline; + } + + constexpr static inline auto AssetType = ET_GRAPHICS_PIPELINE; + inline E_TYPE getAssetType() const override { return AssetType; } + + inline size_t getDependantCount() const override + { + auto stageCount = 2; // the layout and renderpass + for (const auto& info : m_specInfos) + { + if (info.shader) + stageCount++; + } + return stageCount; + } + inline SCachedCreationParams& getCachedCreationParams() { assert(isMutable()); return m_params; } + inline virtual std::span getSpecInfo(hlsl::ShaderStage stage) override final + { + const auto stageIndex = stageToIndex(stage); + if (isMutable() && stageIndex != -1) + { + return { &m_specInfos[stageIndex], 1 }; + } + return {}; + } + + inline virtual bool valid() const override final + { + // TODO(kevinyu): Fix this temporary stub code + return true; + } + protected: - using base_t::base_t; + using base_t::base_t; ~ICPUGraphicsPipeline() = default; - inline IAsset* getDependant_impl(const size_t ix) override - { - if (ix==0) - return const_cast(m_layout.get()); - if (ix==1) - return m_renderpass.get(); - size_t 
stageCount = 0; - for (auto& specInfo : m_specInfos) - { - if (specInfo.shader) + inline IAsset* getDependant_impl(const size_t ix) override { - if ((stageCount++)==ix-2) - return specInfo.shader.get(); + if (ix==0) + return const_cast(m_layout.get()); + if (ix==1) + return m_renderpass.get(); + size_t stageCount = 0; + for (auto& specInfo : m_specInfos) + { + if (specInfo.shader) + if ((stageCount++)==ix-2) return specInfo.shader.get(); + } + return nullptr; + } + + inline int8_t stageToIndex(const hlsl::ShaderStage stage) const + { + const auto stageIx = hlsl::findLSB(stage); + if (stageIx<0 || stageIx>= GRAPHICS_SHADER_STAGE_COUNT || hlsl::bitCount(stage)!=1) + return -1; + return stageIx; } - } - return nullptr; - } - - inline int8_t stageToIndex(const hlsl::ShaderStage stage) const - { - const auto stageIx = hlsl::findLSB(stage); - if (stageIx<0 || stageIx>= GRAPHICS_SHADER_STAGE_COUNT || hlsl::bitCount(stage)!=1) - return -1; - return stageIx; - } - - inline bool setSpecInfo(const SShaderSpecInfo& info) - { - assert(isMutable()); - const auto specSize = info.valid(); - if (specSize<0) return false; - const auto stage = info.stage; - const auto stageIx = stageToIndex(stage); - if (stageIx<0) return false; - m_specInfos[stageIx] = info; - return true; - } - - SShaderSpecInfo m_specInfos[GRAPHICS_SHADER_STAGE_COUNT]; + + std::array m_specInfos; }; } diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index eb634d3f12..623d5ae2df 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -13,12 +13,95 @@ namespace nbl::asset { +class ICPUPipelineBase +{ + public: + struct SShaderSpecInfo + { + //! Structure specifying a specialization map entry + /* + Note that if specialization constant ID is used + in a shader, \bsize\b and \boffset'b must match + to \isuch an ID\i accordingly. 
+ + By design the API satisfies: + https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-offset-00773 + https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-pMapEntries-00774 + */ + //!< The ID of the specialization constant in SPIR-V. If it isn't used in the shader, the map entry does not affect the behavior of the pipeline. + using spec_constant_id_t = uint32_t; + + struct SSpecConstantValue + { + core::vector data; + inline operator bool() const { return data.size(); } + inline size_t size() const { return data.size(); } + }; + + inline SSpecConstantValue* getSpecializationByteValue(const spec_constant_id_t _specConstID) + { + const auto found = entries.find(_specConstID); + if (found != entries.end() && bool(found->second)) return &found->second; + else return nullptr; + } + + static constexpr int32_t INVALID_SPEC_INFO = -1; + inline int32_t valid() const + { + if (!shader) return INVALID_SPEC_INFO; + + // Impossible to check: https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pName-00707 + if (entryPoint.empty()) return INVALID_SPEC_INFO; + + // Impossible to efficiently check anything from: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-maxClipDistances-00708 + // to: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-06686 + // and from: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02756 + // to: + // 
https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-module-08987 + + int64_t specData = 0; + for (const auto& entry : entries) + { + if (!entry.second) return INVALID_SPEC_INFO; + specData += entry.second.size(); + } + if (specData > 0x7fffffff) return INVALID_SPEC_INFO; + return static_cast(specData); + } + + core::smart_refctd_ptr shader = nullptr; + std::string entryPoint = ""; + IPipelineBase::SUBGROUP_SIZE requiredSubgroupSize : 3 = IPipelineBase::SUBGROUP_SIZE::UNKNOWN; //!< Default value of 8 means no requirement + uint8_t requireFullSubgroups : 1 = false; + + // Container choice implicitly satisfies: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-constantID-04911 + core::unordered_map entries; + // By requiring Nabla Core Profile features we implicitly satisfy: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02784 + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02785 + // Also because our API is sane, it satisfies the following by construction: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02754 + + }; + + virtual std::span getSpecInfo(const hlsl::ShaderStage stage) = 0; + inline std::span getSpecInfo(const hlsl::ShaderStage stage) const + { + return getSpecInfo(stage); + } + + virtual bool valid() const = 0; +}; + // Common Base class for pipelines template -class ICPUPipeline : public IAsset, public PipelineNonAssetBase +class ICPUPipeline : public IAsset, public PipelineNonAssetBase, public ICPUPipelineBase { using this_t = ICPUPipeline; - using shader_info_spec_t = 
IPipelineBase::SShaderSpecInfo; public: @@ -36,9 +119,33 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase PipelineNonAssetBase::m_layout = std::move(_layout); } + inline core::smart_refctd_ptr clone(uint32_t _depth = ~0u) const override final + { + core::smart_refctd_ptr layout; + if (_depth>0u && getLayout()) + layout = core::smart_refctd_ptr_static_cast(getLayout->clone(_depth-1u)); + + auto* newPipeline = clone_impl(std::move(layout), _depth); + + return core::smart_refctd_ptr(newPipeline,core::dont_grab); + } + + SShaderSpecInfo cloneSpecInfo(const SShaderSpecInfo& specInfo, uint32_t depth) + { + auto newSpecInfo = specInfo; + if (depth>0u) + { + newSpecInfo.shader = core::smart_refctd_ptr_static_cast(specInfo.shader->clone(depth - 1u)); + } + return newSpecInfo; + } + protected: + using PipelineNonAssetBase::PipelineNonAssetBase; virtual ~ICPUPipeline() = default; + + virtual this_t* clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const = 0; }; diff --git a/include/nbl/asset/IGraphicsPipeline.h b/include/nbl/asset/IGraphicsPipeline.h index 77f220aa78..3e029e76b2 100644 --- a/include/nbl/asset/IGraphicsPipeline.h +++ b/include/nbl/asset/IGraphicsPipeline.h @@ -81,85 +81,23 @@ class IGraphicsPipelineBase : public virtual core::IReferenceCounted }; }; -template +template class IGraphicsPipeline : public IPipeline, public IGraphicsPipelineBase { protected: using renderpass_t = RenderpassType; - using spec_info_t = SpecInfoType; public: - struct SCreationParams : IPipeline::SCreationParams - { - protected: - template - inline bool impl_valid(ExtraLambda&& extra) const - { - if (!IPipeline::SCreationParams::layout) - return false; - - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-dynamicRendering-06576 - if (!renderpass || cached.subpassIx>=renderpass->getSubpassCount()) - return false; - - // TODO: check rasterization samples, etc. 
- //rp->getCreationParameters().subpasses[i] - - core::bitflag stagePresence = {}; - for (const auto info : shaders) - if (info.shader) - { - if (!extra(info)) - return false; - const auto stage = info.stage; - if (stage>hlsl::ShaderStage::ESS_FRAGMENT) - return false; - if (stagePresence.hasFlags(stage)) - return false; - stagePresence |= stage; - } - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-stage-02096 - if (!stagePresence.hasFlags(hlsl::ShaderStage::ESS_VERTEX)) - return false; - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-00729 - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-00730 - if (stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_CONTROL)!=stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION)) - return false; - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-08888 - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-topology-08889 - if (stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION)!=(cached.primitiveAssembly.primitiveType==EPT_PATCH_LIST)) - return false; - - return true; - } - - public: - inline bool valid() const - { - return impl_valid([](const spec_info_t& info)->bool - { - if (!info.valid()) - return false; - return false; - }); - } - - std::span shaders = {}; - SCachedCreationParams cached = {}; - renderpass_t* renderpass = nullptr; - }; - inline const SCachedCreationParams& getCachedCreationParams() const {return m_params;} - inline const renderpass_t* getRenderpass() const {return m_renderpass.get();} protected: - explicit 
IGraphicsPipeline(const SCreationParams& _params) : - IPipeline(core::smart_refctd_ptr(_params.layout)), - m_params(_params.cached), m_renderpass(core::smart_refctd_ptr(_params.renderpass)) {} + explicit IGraphicsPipeline(const PipelineLayoutType* layout, const SCachedCreationParams& cachedParams, const renderpass_t* renderpass) : + IPipeline(core::smart_refctd_ptr(layout)), m_renderpass(core::smart_refctd_ptr(renderpass)) + {} - SCachedCreationParams m_params; - core::smart_refctd_ptr m_renderpass; + SCachedCreationParams m_params = {}; + core::smart_refctd_ptr m_renderpass = nullptr; }; } diff --git a/include/nbl/asset/IPipeline.h b/include/nbl/asset/IPipeline.h index 97d7ab9c94..98f1671cca 100644 --- a/include/nbl/asset/IPipeline.h +++ b/include/nbl/asset/IPipeline.h @@ -27,265 +27,112 @@ namespace nbl::asset */ class IPipelineBase { - public: - struct SCreationParams - { - protected: - // This is not public to make sure that different pipelines only get the enums they support - enum class FLAGS : uint64_t - { - NONE = 0, // disallowed in maintanance5 - DISABLE_OPTIMIZATIONS = 1<<0, - ALLOW_DERIVATIVES = 1<<1, - - // I can just derive this - //DERIVATIVE = 1<<2, + public: + enum class CreationFlags : uint64_t + { + NONE = 0, // disallowed in maintanance5 + DISABLE_OPTIMIZATIONS = 1 << 0, + ALLOW_DERIVATIVES = 1 << 1, + + // I can just derive this + //DERIVATIVE = 1<<2, + + // Graphics Pipelines only + //VIEW_INDEX_FROM_DEVICE_INDEX = 1<<3, + + // Compute Pipelines only + //DISPATCH_BASE = 1<<4, + + // This is for NV-raytracing extension. Now this is done via IDeferredOperation + //DEFER_COMPILE_NV = 1<<5, + + // We use Renderdoc to take care of this for us, + // we won't be parsing the statistics and internal representation ourselves. 
+ //CAPTURE_STATISTICS = 1<<6, + //CAPTURE_INTERNAL_REPRESENTATIONS = 1<<7, + + // Will soon be deprecated due to + // https://github.com/Devsh-Graphics-Programming/Nabla/issues/854 + FAIL_ON_PIPELINE_COMPILE_REQUIRED = 1 << 8, + EARLY_RETURN_ON_FAILURE = 1 << 9, + + // Will be exposed later with the IPipelineLibrary asset implementation + // https://github.com/Devsh-Graphics-Programming/Nabla/issues/853 + //LINK_TIME_OPTIMIZATION = 1<<10, + + // Won't be exposed because we'll introduce Libraries as a separate object/asset-type + // https://github.com/Devsh-Graphics-Programming/Nabla/issues/853 + //CREATE_LIBRARY = 1<<11, + + // Ray Tracing Pipelines only + //SKIP_BUILT_IN_PRIMITIVES = 1<<12, + //SKIP_AABBS = 1<<13, + //NO_NULL_ANY_HIT_SHADERS = 1<<14, + //NO_NULL_CLOSEST_HIT_SHADERS = 1<<15, + //NO_NULL_MISS_SHADERS = 1<<16, + //NO_NULL_INTERSECTION_SHADERS = 1<<17, + + // There is a new Device Generated Commands extension with its own flag that will deprecate this + //INDIRECT_BINDABLE_NV = 1<<18, + + // Ray Tracing Pipelines only + // For debug tools + //RAY_TRACING_SHADER_GROUP_HANDLE_CAPTURE_REPLAY_BIT_KHR = 1<<19, + + // Ray Tracing Pipelines only + //ALLOW_MOTION = 1<<20, + + // Graphics Pipelineonly (we don't support subpass shading) + //RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR = 1<<21, + //RENDERING_FRAGMENT_DENSITY_MAP_ATTACHMENT_BIT_EXT = 1<<22, + + // Will be exposed later with the IPipelineLibrary asset implementation + // https://github.com/Devsh-Graphics-Programming/Nabla/issues/853 + //RETAIN_LINK_TIME_OPTIMIZATION_INFO = 1<<23, + + // Ray Tracing Pipelines only + //RAY_TRACING_OPACITY_MICROMAP_BIT_EXT = 1<<24, + + // Not supported yet, and we will move to dynamic rendering, so this might never be supported + //COLOR_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT = 1<<25, + //DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT = 1<<26, + + // Not Supported Yet + //NO_PROTECTED_ACCESS=1<<27, + //RAY_TRACING_DISPLACEMENT_MICROMAP_BIT_NV = 1<<28, + 
//DESCRIPTOR_VUFFER_BIT=1<<29, + //PROTECTED_ACCESS_ONLY=1<<30, + }; + + // Nabla requires device's reported subgroup size to be between 4 and 128 + enum class SUBGROUP_SIZE : uint8_t + { + // No constraint but probably means `gl_SubgroupSize` is Dynamically Uniform + UNKNOWN = 0, + // Allows the Subgroup Uniform `gl_SubgroupSize` to be non-Dynamically Uniform and vary between Device's min and max + VARYING = 1, + // The rest we encode as log2(x) of the required value + REQUIRE_4 = 2, + REQUIRE_8 = 3, + REQUIRE_16 = 4, + REQUIRE_32 = 5, + REQUIRE_64 = 6, + REQUIRE_128 = 7 + }; - // Graphics Pipelines only - //VIEW_INDEX_FROM_DEVICE_INDEX = 1<<3, - - // Compute Pipelines only - //DISPATCH_BASE = 1<<4, - - // This is for NV-raytracing extension. Now this is done via IDeferredOperation - //DEFER_COMPILE_NV = 1<<5, - - // We use Renderdoc to take care of this for us, - // we won't be parsing the statistics and internal representation ourselves. - //CAPTURE_STATISTICS = 1<<6, - //CAPTURE_INTERNAL_REPRESENTATIONS = 1<<7, - - // Will soon be deprecated due to - // https://github.com/Devsh-Graphics-Programming/Nabla/issues/854 - FAIL_ON_PIPELINE_COMPILE_REQUIRED = 1<<8, - EARLY_RETURN_ON_FAILURE = 1<<9, - - // Will be exposed later with the IPipelineLibrary asset implementation - // https://github.com/Devsh-Graphics-Programming/Nabla/issues/853 - //LINK_TIME_OPTIMIZATION = 1<<10, - - // Won't be exposed because we'll introduce Libraries as a separate object/asset-type - // https://github.com/Devsh-Graphics-Programming/Nabla/issues/853 - //CREATE_LIBRARY = 1<<11, - - // Ray Tracing Pipelines only - //SKIP_BUILT_IN_PRIMITIVES = 1<<12, - //SKIP_AABBS = 1<<13, - //NO_NULL_ANY_HIT_SHADERS = 1<<14, - //NO_NULL_CLOSEST_HIT_SHADERS = 1<<15, - //NO_NULL_MISS_SHADERS = 1<<16, - //NO_NULL_INTERSECTION_SHADERS = 1<<17, - - // There is a new Device Generated Commands extension with its own flag that will deprecate this - //INDIRECT_BINDABLE_NV = 1<<18, - - // Ray Tracing Pipelines only 
- // For debug tools - //RAY_TRACING_SHADER_GROUP_HANDLE_CAPTURE_REPLAY_BIT_KHR = 1<<19, - - // Ray Tracing Pipelines only - //ALLOW_MOTION = 1<<20, - - // Graphics Pipelineonly (we don't support subpass shading) - //RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR = 1<<21, - //RENDERING_FRAGMENT_DENSITY_MAP_ATTACHMENT_BIT_EXT = 1<<22, - - // Will be exposed later with the IPipelineLibrary asset implementation - // https://github.com/Devsh-Graphics-Programming/Nabla/issues/853 - //RETAIN_LINK_TIME_OPTIMIZATION_INFO = 1<<23, - - // Ray Tracing Pipelines only - //RAY_TRACING_OPACITY_MICROMAP_BIT_EXT = 1<<24, - - // Not supported yet, and we will move to dynamic rendering, so this might never be supported - //COLOR_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT = 1<<25, - //DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT = 1<<26, - - // Not Supported Yet - //NO_PROTECTED_ACCESS=1<<27, - //RAY_TRACING_DISPLACEMENT_MICROMAP_BIT_NV = 1<<28, - //DESCRIPTOR_VUFFER_BIT=1<<29, - //PROTECTED_ACCESS_ONLY=1<<30, - }; - }; - - /* - Specialization info contains things such as entry point to a shader, - specialization map entry, required subgroup size, etc. for a blob of SPIR-V - - It also handles Specialization Constants. - - In Vulkan, all shaders get halfway-compiled into SPIR-V and - then then lowered (compiled) into the HW ISA by the Vulkan driver. - Normally, the half-way compile folds all constant values - and optimizes the code that uses them. - - But, it would be nice every so often to have your Vulkan - program sneak into the halfway-compiled SPIR-V binary and - manipulate some constants at runtime. This is what - Specialization Constants are for. - - So A Specialization Constant is a way of injecting an integer - constant into a halfway-compiled version of a shader right - before the lowering and linking when creating a pipeline. 
- - Without Specialization Constants, you would have to commit - to a final value before the SPIR-V compilation - */ - template - struct SShaderSpecInfo final - { - - //! Structure specifying a specialization map entry - /* - Note that if specialization constant ID is used - in a shader, \bsize\b and \boffset'b must match - to \isuch an ID\i accordingly. - - By design the API satisfies: - https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-offset-00773 - https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-pMapEntries-00774 - */ - //!< The ID of the specialization constant in SPIR-V. If it isn't used in the shader, the map entry does not affect the behavior of the pipeline. - using spec_constant_id_t = uint32_t; - struct SSpecConstantValueImmutable - { - const void* data = nullptr; - //!< The byte size of the specialization constant value within the supplied data buffer. 
- uint32_t size = 0; - - inline operator bool() const {return data&&size;} - - auto operator<=>(const SSpecConstantValueImmutable&) const = default; - }; - - struct SSPecConstantValueMutable - { - core::vector data; - inline operator bool() const { return data.size(); } - auto operator<=>(const SSPecConstantValueMutable&) const = default; - }; - - using SSpecConstantValue = std::conditional_t; - - inline SSpecConstantValue getSpecializationByteValue(const spec_constant_id_t _specConstID) const - { - if (!entries) - return { nullptr,0u }; - - const auto found = entries->find(_specConstID); - if (found != entries->end() && bool(found->second)) - return found->second; - else - return { nullptr,0u }; - } - - // Nabla requires device's reported subgroup size to be between 4 and 128 - enum class SUBGROUP_SIZE : uint8_t - { - // No constraint but probably means `gl_SubgroupSize` is Dynamically Uniform - UNKNOWN = 0, - // Allows the Subgroup Uniform `gl_SubgroupSize` to be non-Dynamically Uniform and vary between Device's min and max - VARYING = 1, - // The rest we encode as log2(x) of the required value - REQUIRE_4 = 2, - REQUIRE_8 = 3, - REQUIRE_16 = 4, - REQUIRE_32 = 5, - REQUIRE_64 = 6, - REQUIRE_128 = 7 - }; - - // - static constexpr int32_t INVALID_SPEC_INFO = -1; - // Returns negative on failure, otherwise the size of the buffer required to reserve for the spec constant data - inline int32_t valid() const - { - if (!shader || hlsl::bitCount(stage)!=1) - return INVALID_SPEC_INFO; - - // Impossible to check: https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pName-00707 - if (entryPoint.empty()) - return INVALID_SPEC_INFO; - - // Shader stages already checked for validity w.r.t. 
features enabled, during unspec shader creation, only check: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-08988 - if (requireFullSubgroups) - switch (stage) - { - case hlsl::ShaderStage::ESS_COMPUTE: [[fallthrough]]; - case hlsl::ShaderStage::ESS_TASK: [[fallthrough]]; - case hlsl::ShaderStage::ESS_MESH: - break; - default: - return INVALID_SPEC_INFO; - break; - } - // Impossible to efficiently check anything from: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-maxClipDistances-00708 - // to: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-06686 - // and from: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02756 - // to: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-module-08987 - - int64_t specData = 0; - if (entries) - for (const auto& entry : *entries) - { - if (!entry.second) - return INVALID_SPEC_INFO; - specData += entry.second.size; - } - if (specData>0x7fffffff) - return INVALID_SPEC_INFO; - return static_cast(specData); - } - - using shader_ptr_t = std::conditional_t, const IShader*>; - using entry_point_t = std::conditional_t; - using spec_constant_map_t = core::unordered_map; - using entries_t = std::conditional_t; - - shader_ptr_t shader = nullptr; - // A name of the function where the entry point of an shader executable begins. It's often "main" function. 
- entry_point_t entryPoint = {}; - // stage must be set - hlsl::ShaderStage stage = hlsl::ShaderStage::ESS_UNKNOWN; - // there's some padding here - SUBGROUP_SIZE requiredSubgroupSize : 3 = SUBGROUP_SIZE::UNKNOWN; //!< Default value of 8 means no requirement - // Valid only for Compute, Mesh and Task shaders - uint8_t requireFullSubgroups : 1 = false; - // Container choice implicitly satisfies: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-constantID-04911 - entries_t entries = nullptr; - // By requiring Nabla Core Profile features we implicitly satisfy: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02784 - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02785 - // Also because our API is sane, it satisfies the following by construction: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02754 - - }; }; template class IPipeline : public IPipelineBase { - public: - // For now, due to API design we implicitly satisfy a bunch of VUIDs - struct SCreationParams : protected IPipelineBase::SCreationParams - { - public: - const PipelineLayout* layout = nullptr; - }; + public: + inline const PipelineLayout* getLayout() const {return m_layout.get();} - inline const PipelineLayout* getLayout() const {return m_layout.get();} + protected: - protected: - inline IPipeline(core::smart_refctd_ptr&& _layout) - : m_layout(std::move(_layout)) {} + inline IPipeline(core::smart_refctd_ptr&& _layout) + : m_layout(std::move(_layout)) {} - core::smart_refctd_ptr m_layout; + core::smart_refctd_ptr m_layout; }; } diff --git a/include/nbl/video/IGPUComputePipeline.h b/include/nbl/video/IGPUComputePipeline.h index 
ba29cc58e2..8c5fc017d9 100644 --- a/include/nbl/video/IGPUComputePipeline.h +++ b/include/nbl/video/IGPUComputePipeline.h @@ -7,20 +7,19 @@ #include "nbl/asset/IPipeline.h" -#include "nbl/video/SPipelineCreationParams.h" +#include "nbl/video/IGPUPipeline.h" #include "nbl/video/SPipelineCreationParams.h" namespace nbl::video { -class IGPUComputePipeline : public IBackendObject, public asset::IPipeline +class IGPUComputePipeline : public IGPUPipeline> { using pipeline_t = asset::IPipeline; - using spec_info_t = SShaderSpecInfo; public: - struct SCreationParams final : pipeline_t::SCreationParams, SPipelineCreationParams + struct SCreationParams final : SPipelineCreationParams { // By construction we satisfy from: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkComputePipelineCreateInfo.html#VUID-VkComputePipelineCreateInfo-flags-03365 @@ -29,7 +28,7 @@ class IGPUComputePipeline : public IBackendObject, public asset::IPipeline(pipeline_t::SCreationParams::FLAGS::F) + #define base_flag(F) static_cast(pipeline_t::CreationFlags::F) enum class FLAGS : uint64_t { NONE = base_flag(NONE), @@ -51,7 +50,7 @@ class IGPUComputePipeline : public IBackendObject, public asset::IPipeline(dataSize)}; } - inline std::span getShaders() const {return {&shader,1}; } + inline std::span getShaders() const {return {&shader,1}; } // TODO: Could guess the required flags from SPIR-V introspection of declared caps core::bitflag flags = FLAGS::NONE; - spec_info_t shader = {}; + SShaderSpecInfo shader = {}; }; inline core::bitflag getCreationFlags() const {return m_flags;} @@ -78,9 +77,8 @@ class IGPUComputePipeline : public IBackendObject, public asset::IPipeline&& _layout, const core::bitflag _flags) : - IBackendObject(core::smart_refctd_ptr(_layout->getOriginDevice())), - pipeline_t(std::move(_layout)), - m_flags(_flags) {} + IGPUPipeline(core::smart_refctd_ptr(_layout->getOriginDevice()), std::move(_layout)), m_flags(_flags) + {} virtual ~IGPUComputePipeline() = 
default; const core::bitflag m_flags; diff --git a/include/nbl/video/IGPUGraphicsPipeline.h b/include/nbl/video/IGPUGraphicsPipeline.h index 4838d7f4d3..9fba0c4a4a 100644 --- a/include/nbl/video/IGPUGraphicsPipeline.h +++ b/include/nbl/video/IGPUGraphicsPipeline.h @@ -6,20 +6,21 @@ #include "nbl/video/IGPUPipelineLayout.h" #include "nbl/video/IGPURenderpass.h" +#include "nbl/video/IGPUPipeline.h" namespace nbl::video { -class IGPUGraphicsPipeline : public IBackendObject, public asset::IGraphicsPipeline, const IGPUPipelineLayout,const IGPURenderpass> +class IGPUGraphicsPipeline : public IGPUPipeline> { - using pipeline_t = asset::IGraphicsPipeline, const IGPUPipelineLayout,const IGPURenderpass>; + using pipeline_t = asset::IGraphicsPipeline; public: - struct SCreationParams final : pipeline_t::SCreationParams, SPipelineCreationParams - { + struct SCreationParams final : public SPipelineCreationParams + { public: - #define base_flag(F) static_cast(pipeline_t::SCreationParams::FLAGS::F) + #define base_flag(F) static_cast(pipeline_t::CreationFlags::F) enum class FLAGS : uint64_t { NONE = base_flag(NONE), @@ -31,12 +32,53 @@ class IGPUGraphicsPipeline : public IBackendObject, public asset::IGraphicsPipel }; #undef base_flag + template + inline bool impl_valid(ExtraLambda&& extra) const + { + if (!layout) + return false; + + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-dynamicRendering-06576 + if (!renderpass || cached.subpassIx>=renderpass->getSubpassCount()) + return false; + + // TODO: check rasterization samples, etc. 
+ //rp->getCreationParameters().subpasses[i] + + core::bitflag stagePresence = {}; + for (const auto info : shaders) + if (info.shader) + { + if (!extra(info)) + return false; + const auto stage = info.stage; + if (stage>hlsl::ShaderStage::ESS_FRAGMENT) + return false; + if (stagePresence.hasFlags(stage)) + return false; + stagePresence |= stage; + } + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-stage-02096 + if (!stagePresence.hasFlags(hlsl::ShaderStage::ESS_VERTEX)) + return false; + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-00729 + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-00730 + if (stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_CONTROL)!=stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION)) + return false; + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-08888 + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-topology-08889 + if (stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION)!=(cached.primitiveAssembly.primitiveType==asset::EPT_PATCH_LIST)) + return false; + + return true; + } + inline SSpecializationValidationResult valid() const { if (!layout) return {}; SSpecializationValidationResult retval = {.count=0,.dataSize=0}; - const bool valid = pipeline_t::SCreationParams::impl_valid([&retval](const spec_info_t& info)->bool + const bool valid = impl_valid([&retval](const SShaderSpecInfo& info)->bool { const auto dataSize = info.valid(); if (dataSize<0) @@ -55,11 +97,16 @@ class IGPUGraphicsPipeline : public IBackendObject, public 
asset::IGraphicsPipel return retval; } - inline std::span getShaders() const {return shaders;} + inline std::span getShaders() const {return shaders;} + + IGPUPipelineLayout* layout = nullptr; + std::span shaders = {}; + SCachedCreationParams cached = {}; + renderpass_t* renderpass = nullptr; // TODO: Could guess the required flags from SPIR-V introspection of declared caps core::bitflag flags = FLAGS::NONE; - }; + }; inline core::bitflag getCreationFlags() const {return m_flags;} @@ -67,8 +114,9 @@ class IGPUGraphicsPipeline : public IBackendObject, public asset::IGraphicsPipel virtual const void* getNativeHandle() const = 0; protected: - IGPUGraphicsPipeline(const SCreationParams& params) : IBackendObject(core::smart_refctd_ptr(params.layout->getOriginDevice())), - pipeline_t(params), m_flags(params.flags) {} + IGPUGraphicsPipeline(const SCreationParams& params) : + IGPUPipeline(core::smart_refctd_ptr(params.layout->getOriginDevice()), params.layout, params.cached, params.renderpass), m_flags(params.flags) + {} virtual ~IGPUGraphicsPipeline() = default; const core::bitflag m_flags; diff --git a/include/nbl/video/IGPUPipeline.h b/include/nbl/video/IGPUPipeline.h new file mode 100644 index 0000000000..0761d5d020 --- /dev/null +++ b/include/nbl/video/IGPUPipeline.h @@ -0,0 +1,110 @@ + + +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_I_GPU_PIPELINE_H_INCLUDED_ +#define _NBL_VIDEO_I_GPU_PIPELINE_H_INCLUDED_ + +#include "nbl/asset/IPipeline.h" + +namespace nbl::video +{ + +class IGPUPipelineBase { + public: + struct SShaderSpecInfo + { + //! Structure specifying a specialization map entry + /* + Note that if specialization constant ID is used + in a shader, \bsize\b and \boffset'b must match + to \isuch an ID\i accordingly. 
+ + By design the API satisfies: + https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-offset-00773 + https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-pMapEntries-00774 + */ + //!< The ID of the specialization constant in SPIR-V. If it isn't used in the shader, the map entry does not affect the behavior of the pipeline. + using spec_constant_id_t = uint32_t; + + struct SSpecConstantValue + { + std::span data; + inline operator bool() const { return data.size(); } + inline size_t size() const { return data.size(); } + }; + + inline SSpecConstantValue getSpecializationByteValue(const spec_constant_id_t _specConstID) const + { + if (!entries) return {}; + + const auto found = entries->find(_specConstID); + if (found != entries->end() && bool(found->second)) return found->second; + else return {}; + } + + static constexpr int32_t INVALID_SPEC_INFO = -1; + inline int32_t valid() const + { + if (!shader) return INVALID_SPEC_INFO; + + // Impossible to check: https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pName-00707 + if (entryPoint.empty()) return INVALID_SPEC_INFO; + + // Impossible to efficiently check anything from: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-maxClipDistances-00708 + // to: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-06686 + // and from: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02756 + // to: + // 
https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-module-08987 + + int64_t specData = 0; + for (const auto& entry : *entries) + { + if (!entry.second) + return INVALID_SPEC_INFO; + specData += entry.second.size(); + } + if (specData>0x7fffffff) + return INVALID_SPEC_INFO; + return static_cast(specData); + } + + const asset::IShader* shader = nullptr; + std::string_view entryPoint = ""; + asset::IPipelineBase::SUBGROUP_SIZE requiredSubgroupSize : 3 = asset::IPipelineBase::SUBGROUP_SIZE::UNKNOWN; //!< Default value of 8 means no requirement + uint8_t requireFullSubgroups : 1 = false; + + // Container choice implicitly satisfies: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-constantID-04911 + const core::unordered_map* entries; + // By requiring Nabla Core Profile features we implicitly satisfy: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02784 + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02785 + // Also because our API is sane, it satisfies the following by construction: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02754 + + }; + +}; + +// Common Base class for pipelines +template +class IGPUPipeline : public IBackendObject, public PipelineNonAssetBase, public IGPUPipelineBase +{ + protected: + + template + explicit IGPUPipeline(core::smart_refctd_ptr&& device, Args&&... 
args) : + PipelineNonAssetBase(std::forward(args...)), IBackendObject(std::move(device)) + {} + virtual ~IGPUPipeline() = default; + +}; + +} + +#endif diff --git a/include/nbl/video/IGPURayTracingPipeline.h b/include/nbl/video/IGPURayTracingPipeline.h index fb8c371193..c41ed333a1 100644 --- a/include/nbl/video/IGPURayTracingPipeline.h +++ b/include/nbl/video/IGPURayTracingPipeline.h @@ -10,7 +10,7 @@ namespace nbl::video { -class IGPURayTracingPipeline : public IBackendObject, public asset::IRayTracingPipeline +class IGPURayTracingPipeline : public IGPUPipeline> { using pipeline_t = asset::IRayTracingPipeline; @@ -30,8 +30,28 @@ class IGPURayTracingPipeline : public IBackendObject, public asset::IRayTracingP uint16_t intersection; }; - struct SCreationParams final : pipeline_t::SCreationParams, SPipelineCreationParams + using SGeneralShaderGroupContainer = core::smart_refctd_dynamic_array; + using SHitShaderGroupContainer = core::smart_refctd_dynamic_array; + + struct SCreationParams final : SPipelineCreationParams { + #define base_flag(F) static_cast(IPipelineBase::CreationFlags::F) + enum class FLAGS : uint64_t + { + NONE = base_flag(NONE), + DISABLE_OPTIMIZATIONS = base_flag(DISABLE_OPTIMIZATIONS), + ALLOW_DERIVATIVES = base_flag(ALLOW_DERIVATIVES), + FAIL_ON_PIPELINE_COMPILE_REQUIRED = base_flag(FAIL_ON_PIPELINE_COMPILE_REQUIRED), + EARLY_RETURN_ON_FAILURE = base_flag(EARLY_RETURN_ON_FAILURE), + SKIP_BUILT_IN_PRIMITIVES = 1<<12, + SKIP_AABBS = 1<<13, + NO_NULL_ANY_HIT_SHADERS = 1<<14, + NO_NULL_CLOSEST_HIT_SHADERS = 1<<15, + NO_NULL_MISS_SHADERS = 1<<16, + NO_NULL_INTERSECTION_SHADERS = 1<<17, + ALLOW_MOTION = 1<<20, + }; + #undef base_flag inline SSpecializationValidationResult valid() const { @@ -42,7 +62,7 @@ class IGPURayTracingPipeline : public IBackendObject, public asset::IRayTracingP .count=0, .dataSize=0, }; - const bool valid = pipeline_t::SCreationParams::impl_valid([&retval](const asset::IPipelineBase::SShaderSpecInfo& info)->bool + const bool 
valid = pipeline_t::SCreationParams::impl_valid([&retval](const spec_info_t& info)->bool { const auto dataSize = info.valid(); if (dataSize<0) @@ -61,8 +81,9 @@ class IGPURayTracingPipeline : public IBackendObject, public asset::IRayTracingP return retval; } - inline std::span getShaders() const { return shaders; } + inline std::span getShaders() const { return shaders; } + IGPUPipelineLayout* layout = nullptr; }; inline core::bitflag getCreationFlags() const { return m_flags; } @@ -82,8 +103,7 @@ class IGPURayTracingPipeline : public IBackendObject, public asset::IRayTracingP virtual uint16_t getDefaultStackSize() const = 0; protected: - IGPURayTracingPipeline(const SCreationParams& params) : IBackendObject(core::smart_refctd_ptr(params.layout->getOriginDevice())), - pipeline_t(params), + IGPURayTracingPipeline(const SCreationParams& params) : IGPUPipeline(core::smart_refctd_ptr(params.layout->getOriginDevice()), params), m_flags(params.flags) {} diff --git a/include/nbl/video/SPipelineCreationParams.h b/include/nbl/video/SPipelineCreationParams.h index 489bff4343..969559d941 100644 --- a/include/nbl/video/SPipelineCreationParams.h +++ b/include/nbl/video/SPipelineCreationParams.h @@ -49,7 +49,7 @@ struct SPipelineCreationParams return basePipelineIndex!=NotDerivingFromPreviousPipeline || basePipeline; } - // If you set this, then we don't take `basePipelineIndex` into account, the pointer takes precedence + const PipelineType* basePipeline = nullptr; int32_t basePipelineIndex = NotDerivingFromPreviousPipeline; }; From a9d5aafcf6188116acb92d3177cb27f1236e9951 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 5 May 2025 17:15:56 +0700 Subject: [PATCH 029/346] Fix gpu graphics pipeline stage validation --- include/nbl/asset/ICPUGraphicsPipeline.h | 8 -------- include/nbl/asset/IGraphicsPipeline.h | 15 +++++++++++++++ include/nbl/video/IGPUGraphicsPipeline.h | 13 +++++-------- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git 
a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index b93b8165aa..0f90f1213d 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -100,14 +100,6 @@ class ICPUGraphicsPipeline final : public ICPUPipeline= GRAPHICS_SHADER_STAGE_COUNT || hlsl::bitCount(stage)!=1) - return -1; - return stageIx; - } - std::array m_specInfos; }; diff --git a/include/nbl/asset/IGraphicsPipeline.h b/include/nbl/asset/IGraphicsPipeline.h index 3e029e76b2..d7ccf598ed 100644 --- a/include/nbl/asset/IGraphicsPipeline.h +++ b/include/nbl/asset/IGraphicsPipeline.h @@ -91,6 +91,21 @@ class IGraphicsPipeline : public IPipeline, public IGraphics inline const SCachedCreationParams& getCachedCreationParams() const {return m_params;} inline const renderpass_t* getRenderpass() const {return m_renderpass.get();} + static inline int8_t stageToIndex(const hlsl::ShaderStage stage) + { + const auto stageIx = hlsl::findLSB(stage); + if (stageIx < 0 || stageIx >= GRAPHICS_SHADER_STAGE_COUNT || hlsl::bitCount(stage)!=1) + return -1; + return stageIx; + } + + static inline hlsl::ShaderStage indexToStage(const int8_t index) + { + if (index < 0 || index > GRAPHICS_SHADER_STAGE_COUNT) + return hlsl::ShaderStage::ESS_UNKNOWN; + return static_cast(hlsl::ShaderStage::ESS_VERTEX + index); + } + protected: explicit IGraphicsPipeline(const PipelineLayoutType* layout, const SCachedCreationParams& cachedParams, const renderpass_t* renderpass) : IPipeline(core::smart_refctd_ptr(layout)), m_renderpass(core::smart_refctd_ptr(renderpass)) diff --git a/include/nbl/video/IGPUGraphicsPipeline.h b/include/nbl/video/IGPUGraphicsPipeline.h index 9fba0c4a4a..50c09123cb 100644 --- a/include/nbl/video/IGPUGraphicsPipeline.h +++ b/include/nbl/video/IGPUGraphicsPipeline.h @@ -46,18 +46,15 @@ class IGPUGraphicsPipeline : public IGPUPipelinegetCreationParameters().subpasses[i] core::bitflag stagePresence = {}; - for (const auto info : shaders) - if 
(info.shader) + for (auto shader_i = 0u; shader_i < shaders.size(); shader_i++) { + const auto& info = shaders[shader_i]; if (!extra(info)) return false; - const auto stage = info.stage; - if (stage>hlsl::ShaderStage::ESS_FRAGMENT) - return false; - if (stagePresence.hasFlags(stage)) - return false; - stagePresence |= stage; + if (info.shader) + stagePresence |= indexToStage(shader_i); } + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-stage-02096 if (!stagePresence.hasFlags(hlsl::ShaderStage::ESS_VERTEX)) return false; From 51b69c1574e89b6b8ff1ac67b748cb7b6f200e77 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 5 May 2025 17:35:42 +0700 Subject: [PATCH 030/346] Fix compute pipeline --- include/nbl/video/IGPUComputePipeline.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/nbl/video/IGPUComputePipeline.h b/include/nbl/video/IGPUComputePipeline.h index 8c5fc017d9..66eb1dba96 100644 --- a/include/nbl/video/IGPUComputePipeline.h +++ b/include/nbl/video/IGPUComputePipeline.h @@ -50,7 +50,7 @@ class IGPUComputePipeline : public IGPUPipeline getShaders() const {return {&shader,1}; } + IGPUPipelineLayout* layout = nullptr; // TODO: Could guess the required flags from SPIR-V introspection of declared caps core::bitflag flags = FLAGS::NONE; SShaderSpecInfo shader = {}; @@ -76,8 +77,8 @@ class IGPUComputePipeline : public IGPUPipeline&& _layout, const core::bitflag _flags) : - IGPUPipeline(core::smart_refctd_ptr(_layout->getOriginDevice()), std::move(_layout)), m_flags(_flags) + inline IGPUComputePipeline(const SCreationParams& params) : + IGPUPipeline(core::smart_refctd_ptr(params.layout->getOriginDevice()), core::smart_refctd_ptr(params.layout)), m_flags(params.flags) {} virtual ~IGPUComputePipeline() = default; From fa759beec86b44dbcf317502870b5d4d713f8e5d Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 5 May 2025 17:55:00 +0700 Subject: [PATCH 
031/346] Implement cpu graphics pipeline validation --- include/nbl/asset/ICPUGraphicsPipeline.h | 15 +++++++++++++-- include/nbl/asset/IGraphicsPipeline.h | 16 ++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index 0f90f1213d..c477d42834 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -77,8 +77,19 @@ class ICPUGraphicsPipeline final : public ICPUPipeline= m_renderpass->getSubpassCount()) return false; + + core::bitflag stagePresence = {}; + for (auto shader_i = 0u; shader_i < m_specInfos.size(); shader_i++) + { + const auto& info = m_specInfos[shader_i]; + if (info.shader) + stagePresence |= indexToStage(shader_i); + } + return isValidStagePresence(stagePresence, m_params.primitiveAssembly.primitiveType); } protected: diff --git a/include/nbl/asset/IGraphicsPipeline.h b/include/nbl/asset/IGraphicsPipeline.h index d7ccf598ed..f47cee0fa2 100644 --- a/include/nbl/asset/IGraphicsPipeline.h +++ b/include/nbl/asset/IGraphicsPipeline.h @@ -106,6 +106,22 @@ class IGraphicsPipeline : public IPipeline, public IGraphics return static_cast(hlsl::ShaderStage::ESS_VERTEX + index); } + static inline bool isValidStagePresence(const core::bitflag& stagePresence, E_PRIMITIVE_TOPOLOGY primitiveType) + { + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-stage-02096 + if (!stagePresence.hasFlags(hlsl::ShaderStage::ESS_VERTEX)) + return false; + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-00729 + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-00730 + if 
(stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_CONTROL)!=stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION)) + return false; + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-08888 + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-topology-08889 + if (stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION)!=(primitiveType==asset::EPT_PATCH_LIST)) + return false; + return true; + } + protected: explicit IGraphicsPipeline(const PipelineLayoutType* layout, const SCachedCreationParams& cachedParams, const renderpass_t* renderpass) : IPipeline(core::smart_refctd_ptr(layout)), m_renderpass(core::smart_refctd_ptr(renderpass)) From de8813feca93e7afdb4a78b37df24987e7c59f48 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 5 May 2025 18:10:14 +0700 Subject: [PATCH 032/346] Implement compute pipeline validation --- include/nbl/asset/ICPUComputePipeline.h | 7 +++---- include/nbl/asset/ICPUGraphicsPipeline.h | 2 +- include/nbl/video/IGPUGraphicsPipeline.h | 15 ++------------- 3 files changed, 6 insertions(+), 18 deletions(-) diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index d9bc8dd646..480f601fc0 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -43,14 +43,13 @@ class ICPUComputePipeline : public ICPUPipeline> inline virtual std::span getSpecInfo(hlsl::ShaderStage stage) override final { if (stage==hlsl::ShaderStage::ESS_COMPUTE && isMutable()) - return {m_specInfo,1}; + return {&m_specInfo,1}; return {}; } inline virtual bool valid() const override final { - // TODO(kevinyu): Fix this temporary dummy code - return true; + return m_specInfo.valid(); } protected: @@ -66,7 +65,7 @@ class ICPUComputePipeline : public ICPUPipeline> private: - 
SShaderSpecInfo m_specInfo; + SShaderSpecInfo m_specInfo; }; diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index c477d42834..7d139d6fe9 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -94,7 +94,7 @@ class ICPUGraphicsPipeline final : public ICPUPipeline(params.layout->getOriginDevice()), params.layout, params.cached, params.renderpass), m_flags(params.flags) {} - virtual ~IGPUGraphicsPipeline() = default; + virtual ~IGPUGraphicsPipeline() override = default; const core::bitflag m_flags; }; From 37ab1ce1b34a82b495581e6f01c0f4c5f6329301 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 17:53:26 +0700 Subject: [PATCH 033/346] Add FLAGS alias --- include/nbl/asset/IPipeline.h | 1 + include/nbl/video/IGPUComputePipeline.h | 2 +- include/nbl/video/IGPUGraphicsPipeline.h | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/nbl/asset/IPipeline.h b/include/nbl/asset/IPipeline.h index 98f1671cca..c458c34afe 100644 --- a/include/nbl/asset/IPipeline.h +++ b/include/nbl/asset/IPipeline.h @@ -103,6 +103,7 @@ class IPipelineBase //DESCRIPTOR_VUFFER_BIT=1<<29, //PROTECTED_ACCESS_ONLY=1<<30, }; + using FLAGS = CreationFlags; // Nabla requires device's reported subgroup size to be between 4 and 128 enum class SUBGROUP_SIZE : uint8_t diff --git a/include/nbl/video/IGPUComputePipeline.h b/include/nbl/video/IGPUComputePipeline.h index 66eb1dba96..6e825d749b 100644 --- a/include/nbl/video/IGPUComputePipeline.h +++ b/include/nbl/video/IGPUComputePipeline.h @@ -28,7 +28,7 @@ class IGPUComputePipeline : public IGPUPipeline(pipeline_t::CreationFlags::F) + #define base_flag(F) static_cast(pipeline_t::FLAGS::F) enum class FLAGS : uint64_t { NONE = base_flag(NONE), diff --git a/include/nbl/video/IGPUGraphicsPipeline.h b/include/nbl/video/IGPUGraphicsPipeline.h index 53ec20244f..fc596a54e1 100644 --- a/include/nbl/video/IGPUGraphicsPipeline.h +++ 
b/include/nbl/video/IGPUGraphicsPipeline.h @@ -20,7 +20,7 @@ class IGPUGraphicsPipeline : public IGPUPipeline { public: - #define base_flag(F) static_cast(pipeline_t::CreationFlags::F) + #define base_flag(F) static_cast(pipeline_t::FLAGS::F) enum class FLAGS : uint64_t { NONE = base_flag(NONE), From a0ecd505814f71309de538a994f141397d9e0bcd Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 17:55:00 +0700 Subject: [PATCH 034/346] Fix clone_impl to return smart pointer --- include/nbl/asset/ICPUComputePipeline.h | 6 +++--- include/nbl/asset/ICPUGraphicsPipeline.h | 6 +++--- include/nbl/asset/ICPUPipeline.h | 10 +++++----- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 480f601fc0..656e8faf6f 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -27,11 +27,11 @@ class ICPUComputePipeline : public ICPUPipeline> return core::smart_refctd_ptr(retval,core::dont_grab); } - inline base_t* clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final + inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final { auto newPipeline = new ICPUComputePipeline(std::move(layout)); - newPipeline->m_specInfo = newPipeline->cloneSpecInfo(m_specInfo, depth); - return newPipeline; + newPipeline->m_specInfo = m_specInfo.clone(depth); + return core::smart_refctd_ptr(newPipeline, core::dont_grab); } constexpr static inline auto AssetType = ET_COMPUTE_PIPELINE; diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index 7d139d6fe9..915a4a43c2 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -29,7 +29,7 @@ class ICPUGraphicsPipeline final : public ICPUPipeline(retval,core::dont_grab); } - inline base_t* clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const 
override final + inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final { auto* newPipeline = new ICPUGraphicsPipeline(layout.get()); for (auto i = 0; i < GRAPHICS_SHADER_STAGE_COUNT; i++) @@ -39,10 +39,10 @@ class ICPUGraphicsPipeline final : public ICPUPipelinem_specInfos[specInfo_i] = newPipeline->cloneSpecInfo(m_specInfos[specInfo_i], depth); + newPipeline->m_specInfos[specInfo_i] = m_specInfos[specInfo_i].clone(depth); } - return newPipeline; + return core::smart_refctd_ptr(newPipeline, core::dont_grab); } constexpr static inline auto AssetType = ET_GRAPHICS_PIPELINE; diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index 623d5ae2df..3b48ea43f7 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -121,13 +121,13 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase, public ICPUPipe inline core::smart_refctd_ptr clone(uint32_t _depth = ~0u) const override final { + if (!getLayout()) return nullptr; + core::smart_refctd_ptr layout; - if (_depth>0u && getLayout()) + if (_depth > 0u) layout = core::smart_refctd_ptr_static_cast(getLayout->clone(_depth-1u)); - auto* newPipeline = clone_impl(std::move(layout), _depth); - - return core::smart_refctd_ptr(newPipeline,core::dont_grab); + return clone_impl(std::move(layout), _depth); } SShaderSpecInfo cloneSpecInfo(const SShaderSpecInfo& specInfo, uint32_t depth) @@ -145,7 +145,7 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase, public ICPUPipe using PipelineNonAssetBase::PipelineNonAssetBase; virtual ~ICPUPipeline() = default; - virtual this_t* clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const = 0; + virtual core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const = 0; }; From 7890981b72e366e62d6a0e0f9d364e3cf82bb5d4 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 17:57:40 +0700 Subject: [PATCH 035/346] Add 
final decoration to ICPUComputePipeline --- include/nbl/asset/ICPUComputePipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 656e8faf6f..0869277911 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -12,7 +12,7 @@ namespace nbl::asset { //! CPU Version of Compute Pipeline -class ICPUComputePipeline : public ICPUPipeline> +class ICPUComputePipeline final : public ICPUPipeline> { using base_t = ICPUPipeline>; From 9a14aa175333af1170e80aa2811ba8df6e684111 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 17:58:15 +0700 Subject: [PATCH 036/346] Make cpu pipeline constructor private --- include/nbl/asset/ICPUComputePipeline.h | 7 ++++--- include/nbl/asset/ICPUGraphicsPipeline.h | 8 +++++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 0869277911..78dc324b50 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -17,9 +17,6 @@ class ICPUComputePipeline final : public ICPUPipeline>; public: - explicit ICPUComputePipeline(const ICPUPipelineLayout* layout): - base_t(core::smart_refctd_ptr(layout)) - {} static core::smart_refctd_ptr create(const ICPUPipelineLayout* layout) { @@ -67,6 +64,10 @@ class ICPUComputePipeline final : public ICPUPipeline(layout)) + {} + }; } diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index 915a4a43c2..2492329a63 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -19,9 +19,6 @@ class ICPUGraphicsPipeline final : public ICPUPipeline; public: - explicit ICPUGraphicsPipeline(const ICPUPipelineLayout* layout) - : base_t(layout, {}, {}) - {} static core::smart_refctd_ptr create(const ICPUPipelineLayout* layout) { @@ -112,6 +109,11 
@@ class ICPUGraphicsPipeline final : public ICPUPipeline m_specInfos; + + private: + explicit ICPUGraphicsPipeline(const ICPUPipelineLayout* layout) + : base_t(layout, {}, {}) + {} }; } From 9bb9d1411780cfa708609a031f3ecbc13cb276d8 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 18:00:14 +0700 Subject: [PATCH 037/346] Add layout validation to compute pipeline validation --- include/nbl/asset/ICPUComputePipeline.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 78dc324b50..9db06dbde1 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -44,8 +44,11 @@ class ICPUComputePipeline final : public ICPUPipelinevalid()) return false; return m_specInfo.valid(); } From bcb096f97dfad9c69b21c50988ab5da1f4c50456 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 18:03:28 +0700 Subject: [PATCH 038/346] Refactor getSpecInfo --- include/nbl/asset/ICPUComputePipeline.h | 2 +- include/nbl/asset/ICPUGraphicsPipeline.h | 7 +++---- include/nbl/asset/ICPUPipeline.h | 16 +++++----------- 3 files changed, 9 insertions(+), 16 deletions(-) diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 9db06dbde1..5f933878b4 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -37,7 +37,7 @@ class ICPUComputePipeline final : public ICPUPipeline getSpecInfo(hlsl::ShaderStage stage) override final + inline virtual std::span getSpecInfo(hlsl::ShaderStage stage) const override final { if (stage==hlsl::ShaderStage::ESS_COMPUTE && isMutable()) return {&m_specInfo,1}; diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index 2492329a63..fb82bd5608 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -62,16 +62,15 @@ class ICPUGraphicsPipeline final : public 
ICPUPipeline getSpecInfo(hlsl::ShaderStage stage) override final + inline virtual std::span getSpecInfo(hlsl::ShaderStage stage) const override final { const auto stageIndex = stageToIndex(stage); - if (isMutable() && stageIndex != -1) - { + if (stageIndex != -1) return { &m_specInfos[stageIndex], 1 }; - } return {}; } + inline virtual bool valid() const override final { if (!m_layout) return false; diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index 3b48ea43f7..fa77c40b7e 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -88,11 +88,7 @@ class ICPUPipelineBase }; - virtual std::span getSpecInfo(const hlsl::ShaderStage stage) = 0; - inline std::span getSpecInfo(const hlsl::ShaderStage stage) const - { - return getSpecInfo(stage); - } + virtual std::span getSpecInfo(const hlsl::ShaderStage stage) const = 0; virtual bool valid() const = 0; }; @@ -131,13 +127,11 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase, public ICPUPipe } SShaderSpecInfo cloneSpecInfo(const SShaderSpecInfo& specInfo, uint32_t depth) + inline std::span getSpecInfo(hlsl::ShaderStage stage) { - auto newSpecInfo = specInfo; - if (depth>0u) - { - newSpecInfo.shader = core::smart_refctd_ptr_static_cast(specInfo.shader->clone(depth - 1u)); - } - return newSpecInfo; + if (!isMutable()) return {}; + const auto specInfo = static_cast(this)->getSpecInfo(stage); + return { const_cast(specInfo.data()), specInfo.size() }; } protected: From 278eb715bcd2e25168f36f30626213a8561ef4f7 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 18:04:31 +0700 Subject: [PATCH 039/346] Move stageToIndex and indexToStage --- include/nbl/asset/ICPUGraphicsPipeline.h | 15 +++++++++++++++ include/nbl/asset/IGraphicsPipeline.h | 14 -------------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index fb82bd5608..926ee0ca6c 100644 --- 
a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -113,6 +113,21 @@ class ICPUGraphicsPipeline final : public ICPUPipeline= GRAPHICS_SHADER_STAGE_COUNT || hlsl::bitCount(stage)!=1) + return -1; + return stageIx; + } + + static inline hlsl::ShaderStage indexToStage(const int8_t index) + { + if (index < 0 || index > GRAPHICS_SHADER_STAGE_COUNT) + return hlsl::ShaderStage::ESS_UNKNOWN; + return static_cast(hlsl::ShaderStage::ESS_VERTEX + index); + } }; } diff --git a/include/nbl/asset/IGraphicsPipeline.h b/include/nbl/asset/IGraphicsPipeline.h index f47cee0fa2..859c80b0b7 100644 --- a/include/nbl/asset/IGraphicsPipeline.h +++ b/include/nbl/asset/IGraphicsPipeline.h @@ -91,20 +91,6 @@ class IGraphicsPipeline : public IPipeline, public IGraphics inline const SCachedCreationParams& getCachedCreationParams() const {return m_params;} inline const renderpass_t* getRenderpass() const {return m_renderpass.get();} - static inline int8_t stageToIndex(const hlsl::ShaderStage stage) - { - const auto stageIx = hlsl::findLSB(stage); - if (stageIx < 0 || stageIx >= GRAPHICS_SHADER_STAGE_COUNT || hlsl::bitCount(stage)!=1) - return -1; - return stageIx; - } - - static inline hlsl::ShaderStage indexToStage(const int8_t index) - { - if (index < 0 || index > GRAPHICS_SHADER_STAGE_COUNT) - return hlsl::ShaderStage::ESS_UNKNOWN; - return static_cast(hlsl::ShaderStage::ESS_VERTEX + index); - } static inline bool isValidStagePresence(const core::bitflag& stagePresence, E_PRIMITIVE_TOPOLOGY primitiveType) { From 68bbcff2d77b6635ed8e1e2cbfc154cda8e12029 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 18:08:34 +0700 Subject: [PATCH 040/346] Add constraint to template parameter of ICPUPipeline and IGPUPipeline --- include/nbl/asset/ICPUPipeline.h | 1 + include/nbl/video/IGPUPipeline.h | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index 
fa77c40b7e..8f41de59ec 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -95,6 +95,7 @@ class ICPUPipelineBase // Common Base class for pipelines template + requires (std::is_base_of_v, PipelineNonAssetBase> && !std::is_base_of_v) class ICPUPipeline : public IAsset, public PipelineNonAssetBase, public ICPUPipelineBase { using this_t = ICPUPipeline; diff --git a/include/nbl/video/IGPUPipeline.h b/include/nbl/video/IGPUPipeline.h index 0761d5d020..4a96c9e01f 100644 --- a/include/nbl/video/IGPUPipeline.h +++ b/include/nbl/video/IGPUPipeline.h @@ -92,14 +92,15 @@ class IGPUPipelineBase { }; // Common Base class for pipelines -template -class IGPUPipeline : public IBackendObject, public PipelineNonAssetBase, public IGPUPipelineBase +template + requires (std::is_base_of_v, PipelineNonBackendObjectBase> && !std::is_base_of_v) +class IGPUPipeline : public IBackendObject, public PipelineNonBackendObjectBase, public IGPUPipelineBase { protected: template explicit IGPUPipeline(core::smart_refctd_ptr&& device, Args&&... args) : - PipelineNonAssetBase(std::forward(args...)), IBackendObject(std::move(device)) + PipelineNonBackendObjectBase(std::forward(args...)), IBackendObject(std::move(device)) {} virtual ~IGPUPipeline() = default; From 8ec04157beeed8ff19aee04e69ccf9ffeeaa6f17 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 18:50:06 +0700 Subject: [PATCH 041/346] Rework IGPUPipeline SSpecConstantValue --- include/nbl/video/IGPUPipeline.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/include/nbl/video/IGPUPipeline.h b/include/nbl/video/IGPUPipeline.h index 4a96c9e01f..2a93895b9d 100644 --- a/include/nbl/video/IGPUPipeline.h +++ b/include/nbl/video/IGPUPipeline.h @@ -28,19 +28,14 @@ class IGPUPipelineBase { //!< The ID of the specialization constant in SPIR-V. If it isn't used in the shader, the map entry does not affect the behavior of the pipeline. 
using spec_constant_id_t = uint32_t; - struct SSpecConstantValue - { - std::span data; - inline operator bool() const { return data.size(); } - inline size_t size() const { return data.size(); } - }; + using SSpecConstantValue = std::span; inline SSpecConstantValue getSpecializationByteValue(const spec_constant_id_t _specConstID) const { if (!entries) return {}; const auto found = entries->find(_specConstID); - if (found != entries->end() && bool(found->second)) return found->second; + if (found != entries->end() && found->second.size()) return found->second; else return {}; } @@ -64,7 +59,7 @@ class IGPUPipelineBase { int64_t specData = 0; for (const auto& entry : *entries) { - if (!entry.second) + if (!entry.second.size()) return INVALID_SPEC_INFO; specData += entry.second.size(); } From fdb4d40a0e846740c3fab9cb2721e3c3aa19742d Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 7 May 2025 14:02:03 +0200 Subject: [PATCH 042/346] create the TLASes and BLASes TODOs: - abstract away the staging cache insert/overwrite - check dependants of TLAS after creation of BLAS - insert the ASes into staging cache - collect the BLASes to use during TLAS builds --- include/nbl/video/utilities/CAssetConverter.h | 26 ++-- src/nbl/video/utilities/CAssetConverter.cpp | 146 +++++++++++------- 2 files changed, 104 insertions(+), 68 deletions(-) diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index 02d43cff69..7492e5ed59 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -1000,7 +1000,12 @@ class CAssetConverter : public core::IReferenceCounted assert(m_minASBuildScratchSize[forHostOps]<=m_maxASBuildScratchSize[forHostOps]); return m_maxASBuildScratchSize[forHostOps]; } -// TODO: `getMinCompactedASAllocatorSpace` + // We do all compactions on the Device for simplicity + inline uint64_t getMinCompactedASAllocatorSpace() const + { + assert(m_compactedASMaxMemory == 0 || 
willDeviceASBuild() || willHostASBuild()); + return m_compactedASMaxMemory; + } // tells you if you need to provide a valid `SConvertParams::scratchForDeviceASBuild` inline bool willDeviceASBuild() const {return getMinASBuildScratchSize(false)>0;} // tells you if you need to provide a valid `SConvertParams::scratchForHostASBuild` @@ -1013,8 +1018,7 @@ class CAssetConverter : public core::IReferenceCounted // tells you if you need to provide a valid `SConvertParams::compactedASAllocator` inline bool willCompactAS() const { - assert(!m_willCompactSomeAS || willDeviceASBuild() || willHostASBuild()); - return m_willCompactSomeAS; + return getMinCompactedASAllocatorSpace()!=0; } // @@ -1106,29 +1110,23 @@ class CAssetConverter : public core::IReferenceCounted template struct SConvReqAccelerationStructure : SConversionRequestBase { - constexpr static inline uint64_t WontCompact = (0x1ull<<48)-1; - inline bool compact() const {return compactedASWriteOffset!=WontCompact;} - using build_f = typename asset_traits::video_t::BUILD_FLAGS; inline void setBuildFlags(const build_f _flags) {buildFlags = static_cast(_flags);} inline build_f getBuildFlags() const {return static_cast(buildFlags);} - - uint64_t scratchSize; - uint64_t compactedASWriteOffset : 48 = WontCompact; - uint64_t buildFlags : 16 = static_cast(build_f::NONE); + uint64_t scratchSize : 45; + uint64_t compact : 1; + uint64_t buildFlags : 16 = 0; }; using SConvReqBLAS = SConvReqAccelerationStructure; core::vector m_blasConversions[2]; using SConvReqTLAS = SConvReqAccelerationStructure; core::vector m_tlasConversions[2]; - // 0 for device builds, 1 for host builds + // array index 0 for device builds, 1 for host builds uint64_t m_minASBuildScratchSize[2] = {0,0}; uint64_t m_maxASBuildScratchSize[2] = {0,0}; -// TODO: make the compaction count the size - // We do all compactions on the Device for simplicity - uint8_t m_willCompactSomeAS : 1 = false; + uint64_t m_compactedASMaxMemory = 0; // This tracks non-root BLASes 
which are needed for a subsequent TLAS build. Note that even things which are NOT in the staging cache are tracked here to make sure they don't finish their lifetimes early. struct BLASUsedInTLASBuild { diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 1f9ca46462..0d98609c2c 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2530,13 +2530,15 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // BLAS and TLAS creation is somewhat delayed by buffer creation and allocation struct DeferredASCreationParams { + const IAccelerationStructure* canonical; asset_cached_t storage; - size_t scratchSize : 62 = 0; + size_t scratchSize : 45 = 0; size_t motionBlur : 1 = false; + size_t buildFlags : 16 = 0; + size_t hostBuild : 1 = false; size_t compactAfterBuild : 1 = false; #ifdef NBL_ACCELERATION_STRUCTURE_CONVERSION size_t inputSize = 0; - uint32_t maxInstanceCount = 0; #endif }; core::vector accelerationStructureParams[2]; @@ -2721,25 +2723,25 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { const auto* as = entry.second.canonicalAsset; const auto& patch = dfsCache.nodes[entry.second.patchIndex.value].patch; - const bool motionBlur = as->usesMotion(); + const bool motionBlur = patch.isMotion; + const auto buildFlags = patch.getBuildFlags(as); + const auto outIx = i+entry.second.firstCopyIx; + const auto uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[outIx]; ILogicalDevice::AccelerationStructureBuildSizes sizes = {}; -#ifdef NBL_ACCELERATION_STRUCTURE_CONVERSION - // we will need to temporarily store the build input buffers somewhere - size_t inputSize = 0; +// size_t inputSize = 0; { - const auto buildFlags = patch.getBuildFlags(as); if constexpr (IsTLAS) { AssetVisitor> visitor = { {visitBase}, - {asset,uniqueCopyGroupID}, + {as,uniqueCopyGroupID}, patch }; if (!visitor()) continue; const auto instanceCount = 
as->getInstances().size(); sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,instanceCount); - inputSize = (motionBlur ? sizeof(IGPUTopLevelAccelerationStructure::DevicePolymorphicInstance):sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance))*instanceCount; +// inputSize = (motionBlur ? sizeof(IGPUTopLevelAccelerationStructure::DevicePolymorphicInstance):sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance))*instanceCount; } else { @@ -2762,14 +2764,14 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult }; sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); // TODO: check if the strides need to be aligned to 4 bytes for AABBs - for (const auto& geom : geoms) - if (const auto aabbCount=*(pMaxPrimitiveCounts++); aabbCount) - inputSize = core::roundUp(inputSize,sizeof(float))+aabbCount*geom.stride; +// for (const auto& geom : geoms) +// if (const auto aabbCount=*(pMaxPrimitiveCounts++); aabbCount) +// inputSize = core::roundUp(inputSize,sizeof(float))+aabbCount*geom.stride; } } else { - core::map allocationsPerStride; +// core::map allocationsPerStride; const auto geoms = as->getTriangleGeometries(); if (patch.hostBuild) { @@ -2784,6 +2786,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult reinterpret_cast*>(geoms.data()),geoms.size() }; sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); +#if 0 // TODO: check if the strides need to be aligned to 4 bytes for AABBs for (const auto& geom : geoms) if (const auto triCount=*(pMaxPrimitiveCounts++); triCount) @@ -2804,17 +2807,19 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult bytesPerVertex += bytesPerVertex; allocationsPerStride[geom.vertexStride] += geom.maxVertex; } +#endif } - for (const auto& entry : allocationsPerStride) - inputSize = 
core::roundUp(inputSize,entry.first)+entry.first*entry.second; +// for (const auto& entry : allocationsPerStride) +// inputSize = core::roundUp(inputSize,entry.first)+entry.first*entry.second; } } } if (!sizes) continue; -#endif + // we need to save the buffer in a side-channel for later auto& out = accelerationStructureParams[IsTLAS][entry.second.firstCopyIx+i]; + out.canonical = as; // this is where it gets a bit weird, we need to create a buffer to back the acceleration structure { IGPUBuffer::SCreationParams params = {}; @@ -2822,8 +2827,6 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult params.size = core::roundUp(sizes.accelerationStructureSize,MinASBufferAlignment); params.usage = IGPUBuffer::E_USAGE_FLAGS::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT|IGPUBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT; // concurrent ownership if any - const auto outIx = i + entry.second.firstCopyIx; - const auto uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[outIx]; const auto queueFamilies = inputs.getSharedOwnershipQueueFamilies(uniqueCopyGroupID,as,patch); params.queueFamilyIndexCount = queueFamilies.size(); params.queueFamilyIndices = queueFamilies.data(); @@ -2831,6 +2834,8 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } out.scratchSize = sizes.buildScratchSize; out.motionBlur = motionBlur; + out.buildFlags = static_cast(buildFlags.value); + out.hostBuild = patch.hostBuild; out.compactAfterBuild = patch.compactAfterBuild; if (out.storage && !deferredAllocator.request(&out.storage,patch.hostBuild ? 
hostBuildMemoryTypes:deviceBuildMemoryTypes)) out.storage.value = nullptr; @@ -3276,6 +3281,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } // Propagate the results back, since the dfsCache has the original asset pointers as keys, we map in reverse + // TODO: this probably could go at the end of the object creation routines // This gets deferred till AFTER the Buffer Memory Allocations and Binding for Acceleration Structures if constexpr (!std::is_same_v && !std::is_same_v) dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void @@ -3304,6 +3310,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // unhashables were not supposed to be added to conversion requests assert(contentHash!=CHashCache::NoContentHash); +// abstract away start const auto copyIx = found->second.firstCopyIx++; // the counting sort was stable assert(uniqueCopyGroupID==gpuObjUniqueCopyGroupIDs[copyIx]); @@ -3333,6 +3340,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult stagingCache.emplace(gpuObj.get(),typename CCache::key_t(contentHash,uniqueCopyGroupID)); // propagate back to dfsCache created.gpuObj = std::move(gpuObj); +// abstract away end // record if a device memory allocation will be needed if constexpr (std::is_base_of_v::video_t>) { @@ -3351,7 +3359,6 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult const uint16_t recomputeMips = created.patch.recomputeMips; retval.m_imageConversions.emplace_back(SReserveResult::SConversionRequestBase{core::smart_refctd_ptr(instance.asset),created.gpuObj.get()},recomputeMips); } -// TODO: BLAS and TLAS requests } ); }; @@ -3392,51 +3399,72 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } // Deal with Deferred Creation of Acceleration structures { - for (auto asLevel=0; asLevel<2; asLevel++) + const auto minScratchAlignment = 
device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment; + auto createAccelerationStructures = [&]()->void { - // each of these stages must have a barrier inbetween - size_t scratchSizeFullParallelBuild = 0; - size_t scratchSizeFullParallelCompact = 0; + constexpr bool IsTLAS = std::is_same_v; + // TLAS and BLAS can't build concurrently, index 0 is device build, 1 is host build + size_t scratchSizeFullParallelBuild[2] = {0,0}; + // + core::vector>* pConversions; + if constexpr (IsTLAS) + pConversions = retval.m_tlasConversions; + else + pConversions = retval.m_blasConversions; // we collect that stats AFTER making sure that the BLAS / TLAS can actually be created - for (const auto& deferredParams : accelerationStructureParams[asLevel]) + for (const auto& deferredParams : accelerationStructureParams[IsTLAS]) { // buffer failed to create/allocate if (!deferredParams.storage) continue; -#ifdef NBL_ACCELERATION_STRUCTURE_CONVERSION + const auto bufSz = deferredParams.storage.get()->getSize(); IGPUAccelerationStructure::SCreationParams baseParams; { - auto* buf = deferredParams.storage.get(); - const auto bufSz = buf->getSize(); using create_f = IGPUAccelerationStructure::SCreationParams::FLAGS; baseParams = { - .bufferRange = {.offset=0,.size=bufSz,.buffer=smart_refctd_ptr(buf)}, + .bufferRange = {.offset=0,.size=bufSz,.buffer=deferredParams.storage.value}, .flags = deferredParams.motionBlur ? create_f::MOTION_BIT:create_f::NONE }; } - smart_refctd_ptr as; - if (asLevel) + // + auto& request = pConversions[deferredParams.hostBuild].emplace_back(); + request.canonical = smart_refctd_ptr(static_cast(deferredParams.canonical)); + smart_refctd_ptr::video_t> as; + if constexpr (IsTLAS) { - as = device->createBottomLevelAccelerationStructure({baseParams,deferredParams.maxInstanceCount}); + // is there any reason for it to be more? 
+ const uint32_t maxInstances = request.canonical->getInstances().size(); + as = device->createTopLevelAccelerationStructure({std::move(baseParams),maxInstances}); } else + as = device->createBottomLevelAccelerationStructure(std::move(baseParams)); + request.gpuObj = as.get(); + if (!request.gpuObj) { - as = device->createTopLevelAccelerationStructure({baseParams,deferredParams.maxInstanceCount}); + inputs.logger.log("Failed to Create Acceleration Structure.",system::ILogger::ELL_ERROR); + continue; } + request.scratchSize = deferredParams.scratchSize; + request.compact = deferredParams.compactAfterBuild; + request.buildFlags = deferredParams.buildFlags; + // best case + size_t buildSize = 0; +// TODO: compute inputs with alignment + buildSize = core::alignUp(buildSize,minScratchAlignment)+deferredParams.scratchSize; + // sizes for building 1-by-1 vs parallel, note that BLAS and TLAS can't be built concurrently + retval.m_minASBuildScratchSize[deferredParams.hostBuild] = core::max(retval.m_minASBuildScratchSize[deferredParams.hostBuild],buildSize); + scratchSizeFullParallelBuild[deferredParams.hostBuild] += buildSize; // note that in order to compact an AS you need to allocate a buffer range whose size is known only after the build -// TODO: compute with alignment - const auto buildSize = deferredParams.inputSize+deferredParams.scratchSize; - // sizes for building 1-by-1 vs parallel, note that - retval.m_minASBuildScratchSize = core::max(buildSize,retval.m_minASBuildScratchSize); - scratchSizeFullParallelBuild += buildSize; - // triangles, AABBs or Instance Transforms will need to be supplied from VRAM -#endif + if (deferredParams.compactAfterBuild) + retval.m_compactedASMaxMemory += bufSz; } - // -// retval.m_maxASBuildScratchSize[0] = core::max(scratchSizeFullParallelBuild,retval.m_maxASBuildScratchSize); - } + retval.m_maxASBuildScratchSize[0] = core::max(scratchSizeFullParallelBuild[0],retval.m_maxASBuildScratchSize[0]); + retval.m_maxASBuildScratchSize[1] = 
core::max(scratchSizeFullParallelBuild[1],retval.m_maxASBuildScratchSize[1]); + }; + createAccelerationStructures.template operator()(); + createAccelerationStructures.template operator()(); // - if (retval.willDeviceASBuild()) + if (retval.willDeviceASBuild() || retval.willCompactAS()) retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; } @@ -3555,13 +3583,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul return retval; } using buffer_usage_f = IGPUBuffer::E_USAGE_FLAGS; - constexpr buffer_usage_f asBuildInputFlags = buffer_usage_f::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT|buffer_usage_f::EUF_TRANSFER_DST_BIT|buffer_usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT; - // we may use the staging buffer directly to skip an extra copy on small enough geometries - if (!params.utilities->getDefaultUpStreamingBuffer()->getBuffer()->getCreationParams().usage.hasFlags(asBuildInputFlags)) - { - logger.log("An Acceleration Structure will be built on Device but Default UpStreaming Buffer from IUtilities doesn't have required usage flags!",system::ILogger::ELL_ERROR); - return retval; - } + constexpr buffer_usage_f asBuildInputFlags = buffer_usage_f::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT|buffer_usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT; constexpr buffer_usage_f asBuildScratchFlags = buffer_usage_f::EUF_STORAGE_BUFFER_BIT|buffer_usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT; auto* scratchBuffer = params.scratchForDeviceASBuild->getBuffer(); const auto& scratchParams = scratchBuffer->getCachedCreationParams(); @@ -3583,6 +3605,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul logger.log("Acceleration Structure Scratch Device Memory Allocator not large enough!",system::ILogger::ELL_ERROR); return retval; } + // this alignment is probably bigger than required by any Build Input const auto minScratchAlignment = device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment; if (addrAlloc.max_alignment() 
CAssetConverter::convert_impl(SReserveResul logger.log("Acceleration Structure Scratch Device Memory Allocator not mapped and not concurrently share-able by Transfer Family %d!",system::ILogger::ELL_ERROR,transferFamily); return retval; } + if (!scratchBuffer->getCreationParams().usage.hasFlags(buffer_usage_f::EUF_TRANSFER_DST_BIT)) + { + logger.log("Acceleration Structure Scratch Device Memory Allocator not mapped and doesn't the transfer destination usage flag!",system::ILogger::ELL_ERROR); + return retval; + } + // Right now we copy from staging to scratch, but in the future we may use the staging buffer directly to skip an extra copy on small enough geometries + if (!params.utilities->getDefaultUpStreamingBuffer()->getBuffer()->getCreationParams().usage.hasFlags(asBuildInputFlags|buffer_usage_f::EUF_TRANSFER_SRC_BIT)) + { + logger.log("An Acceleration Structure will be built on Device but Default UpStreaming Buffer from IUtilities doesn't have required usage flags!", system::ILogger::ELL_ERROR); + return retval; + } reqQueueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; } } @@ -3617,10 +3651,14 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul return retval; } // and compacting - if (reservations.willCompactAS() && !params.compactedASAllocator) + if (reservations.willCompactAS()) { - logger.log("An Acceleration Structure will be compacted but no Device Memory Allocator provided!", system::ILogger::ELL_ERROR); - return retval; + if (!params.compactedASAllocator) + { + logger.log("An Acceleration Structure will be compacted but no Device Memory Allocator provided!", system::ILogger::ELL_ERROR); + return retval; + } + // note that can't check the compacted AS allocator being large enough against `reservations.m_compactedASMaxMemory` } // @@ -4741,7 +4779,7 @@ if (worstSize>minScratchSize) // no special extra byte offset into the instance buffer rangeInfos.emplace_back(instanceCount,0u); // - const bool willCompact = tlasToBuild.compact(); + const 
bool willCompact = tlasToBuild.compact; if (willCompact) compactions.push_back(as); // enqueue ownership release if necessary From 57136e8cb8bd148a18286859ac8d79def85b9039 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 19:16:58 +0700 Subject: [PATCH 043/346] Rework SShaderSpecInfo for ICPUPIpeline --- include/nbl/asset/ICPUGraphicsPipeline.h | 2 +- include/nbl/asset/ICPUPipeline.h | 11 +++-------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index 926ee0ca6c..62b25443cc 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -85,7 +85,7 @@ class ICPUGraphicsPipeline final : public ICPUPipeline data; - inline operator bool() const { return data.size(); } - inline size_t size() const { return data.size(); } - }; + using SSpecConstantValue = core::vector; inline SSpecConstantValue* getSpecializationByteValue(const spec_constant_id_t _specConstID) { const auto found = entries.find(_specConstID); - if (found != entries.end() && bool(found->second)) return &found->second; + if (found != entries.end() && found->second.size()) return &found->second; else return nullptr; } @@ -65,7 +60,7 @@ class ICPUPipelineBase int64_t specData = 0; for (const auto& entry : entries) { - if (!entry.second) return INVALID_SPEC_INFO; + if (!entry.second.size()) return INVALID_SPEC_INFO; specData += entry.second.size(); } if (specData > 0x7fffffff) return INVALID_SPEC_INFO; From 7983e62a27f29c906d64a2152d4033dd9a28a185 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 19:17:15 +0700 Subject: [PATCH 044/346] Move cloneSpecInfo into SShaderSpecInfo --- include/nbl/asset/ICPUPipeline.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index ddfb4628c8..69d709d1d0 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ 
b/include/nbl/asset/ICPUPipeline.h @@ -81,6 +81,15 @@ class ICPUPipelineBase // Also because our API is sane, it satisfies the following by construction: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02754 + SShaderSpecInfo clone(uint32_t depth) const + { + auto newSpecInfo = *this; + if (depth > 0u) + { + newSpecInfo.shader = core::smart_refctd_ptr_static_cast(this->shader->clone(depth - 1u)); + } + return newSpecInfo; + } }; virtual std::span getSpecInfo(const hlsl::ShaderStage stage) const = 0; @@ -122,7 +131,6 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase, public ICPUPipe return clone_impl(std::move(layout), _depth); } - SShaderSpecInfo cloneSpecInfo(const SShaderSpecInfo& specInfo, uint32_t depth) inline std::span getSpecInfo(hlsl::ShaderStage stage) { if (!isMutable()) return {}; From 071f1ebbb0da7090d097233248b15e25092580b6 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 19:17:45 +0700 Subject: [PATCH 045/346] Remove valid virtual function from ICPUPipeline to IAsset --- include/nbl/asset/IAsset.h | 2 ++ include/nbl/asset/ICPUPipeline.h | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/nbl/asset/IAsset.h b/include/nbl/asset/IAsset.h index fdb41ed298..3b8b123ce3 100644 --- a/include/nbl/asset/IAsset.h +++ b/include/nbl/asset/IAsset.h @@ -169,6 +169,8 @@ class IAsset : virtual public core::IReferenceCounted return retval; } + virtual bool valid() const = 0; + protected: inline IAsset() = default; //! 
Pure virtual destructor to ensure no instantiation diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index 69d709d1d0..8b90458f21 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -94,7 +94,6 @@ class ICPUPipelineBase virtual std::span getSpecInfo(const hlsl::ShaderStage stage) const = 0; - virtual bool valid() const = 0; }; // Common Base class for pipelines From b8f8ba04db3e44aa30e1c265bd108367a2707b19 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 19:18:33 +0700 Subject: [PATCH 046/346] Remove getShaders from SShaderSpecInfo --- include/nbl/video/IGPUComputePipeline.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/nbl/video/IGPUComputePipeline.h b/include/nbl/video/IGPUComputePipeline.h index 6e825d749b..42503e1f12 100644 --- a/include/nbl/video/IGPUComputePipeline.h +++ b/include/nbl/video/IGPUComputePipeline.h @@ -63,8 +63,6 @@ class IGPUComputePipeline : public IGPUPipeline(dataSize)}; } - inline std::span getShaders() const {return {&shader,1}; } - IGPUPipelineLayout* layout = nullptr; // TODO: Could guess the required flags from SPIR-V introspection of declared caps core::bitflag flags = FLAGS::NONE; From f661366d6a35cc83fe578436ce0b168f99d381de Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 19:21:20 +0700 Subject: [PATCH 047/346] Rename isValidStagePresence to hasRequiredStages --- include/nbl/asset/IGraphicsPipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/asset/IGraphicsPipeline.h b/include/nbl/asset/IGraphicsPipeline.h index 859c80b0b7..ef49e4c03a 100644 --- a/include/nbl/asset/IGraphicsPipeline.h +++ b/include/nbl/asset/IGraphicsPipeline.h @@ -92,7 +92,7 @@ class IGraphicsPipeline : public IPipeline, public IGraphics inline const renderpass_t* getRenderpass() const {return m_renderpass.get();} - static inline bool isValidStagePresence(const core::bitflag& stagePresence, E_PRIMITIVE_TOPOLOGY primitiveType) + 
static inline bool hasRequiredStages(const core::bitflag& stagePresence, E_PRIMITIVE_TOPOLOGY primitiveType) { // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-stage-02096 if (!stagePresence.hasFlags(hlsl::ShaderStage::ESS_VERTEX)) From 8c10cbdaba40e8fa0eb569ef8e00f39f181d065d Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 19:25:01 +0700 Subject: [PATCH 048/346] Rework IGPUGraphicsPipeline to have individual shaderSpecInfo per stages --- include/nbl/video/IGPUGraphicsPipeline.h | 31 +++++++++++++++--------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/include/nbl/video/IGPUGraphicsPipeline.h b/include/nbl/video/IGPUGraphicsPipeline.h index fc596a54e1..f5d6e40275 100644 --- a/include/nbl/video/IGPUGraphicsPipeline.h +++ b/include/nbl/video/IGPUGraphicsPipeline.h @@ -46,16 +46,21 @@ class IGPUGraphicsPipeline : public IGPUPipelinegetCreationParameters().subpasses[i] core::bitflag stagePresence = {}; - for (auto shader_i = 0u; shader_i < shaders.size(); shader_i++) - { - const auto& info = shaders[shader_i]; - if (!extra(info)) - return false; - if (info.shader) - stagePresence |= indexToStage(shader_i); - } - return isValidStagePresence(stagePresence, cached.primitiveAssembly.primitiveType); + auto processSpecInfo = [&](const SShaderSpecInfo& specInfo, hlsl::ShaderStage stage) + { + if (!extra(specInfo)) return false; + if (!specInfo.shader) return false; + stagePresence != stage; + return true; + }; + if (!processSpecInfo(vertexShader)) return false; + if (!processSpecInfo(tesselationControlShader)) return false; + if (!processSpecInfo(tesselationEvaluationShader)) return false; + if (!processSpecInfo(geometryShader)) return false; + if (!processSpecInfo(fragmentShader)) return false; + + return hasRequiredStages(stagePresence, cached.primitiveAssembly.primitiveType); } @@ -83,10 +88,12 @@ class IGPUGraphicsPipeline : public IGPUPipeline 
getShaders() const {return shaders;} - IGPUPipelineLayout* layout = nullptr; - std::span shaders = {}; + SShaderSpecInfo vertexShader; + SShaderSpecInfo tesselationControlShader; + SShaderSpecInfo tesselationEvaluationShader; + SShaderSpecInfo geometryShader; + SShaderSpecInfo fragmentShader; SCachedCreationParams cached = {}; renderpass_t* renderpass = nullptr; From 71056f2274b118a0ed6c22aff5b1f5ff5d96e133 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 19:25:24 +0700 Subject: [PATCH 049/346] Add IGPUPipelineLayout to IGPUPipeline --- include/nbl/video/IGPUPipeline.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/nbl/video/IGPUPipeline.h b/include/nbl/video/IGPUPipeline.h index 2a93895b9d..826026d9aa 100644 --- a/include/nbl/video/IGPUPipeline.h +++ b/include/nbl/video/IGPUPipeline.h @@ -6,6 +6,7 @@ #ifndef _NBL_VIDEO_I_GPU_PIPELINE_H_INCLUDED_ #define _NBL_VIDEO_I_GPU_PIPELINE_H_INCLUDED_ +#include "nbl/video/IGPUPipelineLayout.h" #include "nbl/asset/IPipeline.h" namespace nbl::video From 802ff9aefc3dfca7028034232789f1b31e89aa0e Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 7 May 2025 14:38:48 +0200 Subject: [PATCH 050/346] get the thing to compile and estimate build input size --- include/nbl/video/IGPUAccelerationStructure.h | 3 +++ src/nbl/video/utilities/CAssetConverter.cpp | 21 ++++++++++++++----- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/include/nbl/video/IGPUAccelerationStructure.h b/include/nbl/video/IGPUAccelerationStructure.h index 5d8f0ca29b..c3a24080d0 100644 --- a/include/nbl/video/IGPUAccelerationStructure.h +++ b/include/nbl/video/IGPUAccelerationStructure.h @@ -638,6 +638,9 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr // I don't do an actual union because the preceeding members don't play nicely with alignment of `core::matrix3x4SIMD` and Vulkan requires this struct to be packed SRTMotionInstance largestUnionMember = {}; 
static_assert(alignof(SRTMotionInstance)==8ull); + + public: + constexpr static inline size_t LargestUnionMemberSize = sizeof(largestUnionMember); }; using DevicePolymorphicInstance = PolymorphicInstance; using HostPolymorphicInstance = PolymorphicInstance; diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 0d98609c2c..0134991976 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -3447,10 +3447,21 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult request.scratchSize = deferredParams.scratchSize; request.compact = deferredParams.compactAfterBuild; request.buildFlags = deferredParams.buildFlags; - // best case - size_t buildSize = 0; -// TODO: compute inputs with alignment - buildSize = core::alignUp(buildSize,minScratchAlignment)+deferredParams.scratchSize; + // prevent CPU hangs by making sure allocator big enough to service us in worst case but with best case allocator (no other allocations, clean alloc) + // TODO: take into account the minimal allocation size from the allocator (ask for it) + size_t buildSize = deferredParams.scratchSize; + if constexpr (IsTLAS) + { + const uint32_t instanceCount = request.canonical->getInstances().size(); + // Worst case approximation, not all instances will be that size (note that host and device instance data are same size) + const size_t approxInstanceSize = deferredParams.motionBlur ? 
IGPUTopLevelAccelerationStructure::DevicePolymorphicInstance::LargestUnionMemberSize:sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance); + buildSize = core::alignUp(buildSize,approxInstanceSize)+instanceCount*approxInstanceSize; + buildSize = core::alignUp(buildSize,alignof(uint64_t))+instanceCount*sizeof(uint64_t); + } + else + { +// TODO: compute BLAS input size with alignments + } // sizes for building 1-by-1 vs parallel, note that BLAS and TLAS can't be built concurrently retval.m_minASBuildScratchSize[deferredParams.hostBuild] = core::max(retval.m_minASBuildScratchSize[deferredParams.hostBuild],buildSize); scratchSizeFullParallelBuild[deferredParams.hostBuild] += buildSize; @@ -4652,7 +4663,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul addr_t offsets[MaxAllocCount] = {scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value}; const addr_t sizes[MaxAllocCount] = {tlasToBuild.scratchSize,instanceDataSize,sizeof(void*)*instanceCount}; { - const addr_t alignments[MaxAllocCount] = {limits.minAccelerationStructureScratchOffsetAlignment,16,8}; + const addr_t alignments[MaxAllocCount] = {limits.minAccelerationStructureScratchOffsetAlignment,16,alignof(uint64_t)}; /* TODO: move to reserve phase - prevent CPU hangs by making sure allocator big enough to service us { addr_t worstSize = sizes[0]; From 8f1911221f474f159c844070c2b0b61ef9333bb6 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 7 May 2025 16:45:59 +0200 Subject: [PATCH 051/346] check the GPU BLASes needed for TLAS build exist before creating TLAS Finish getting build sizes for BLAS and TLAS Also ask for minimum allocation size constraint of the allocator --- include/nbl/video/utilities/CAssetConverter.h | 3 + src/nbl/video/utilities/CAssetConverter.cpp | 147 +++++++++--------- 2 files changed, 80 insertions(+), 70 deletions(-) diff --git a/include/nbl/video/utilities/CAssetConverter.h 
b/include/nbl/video/utilities/CAssetConverter.h index 7492e5ed59..3e134b913d 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -900,6 +900,9 @@ class CAssetConverter : public core::IReferenceCounted IGPUPipelineCache* pipelineCache = nullptr; // optional, defaults to the device IDeviceMemoryAllocator* allocator = nullptr; + // optional, defaults to worst case (Apple Silicon page size) + uint32_t scratchForDeviceASBuildMinAllocSize = 1<<14; + uint32_t scratchForHostASBuildMinAllocSize = 1<<14; }; // Split off from inputs because only assets that build on IPreHashed need uploading struct SConvertParams diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 0134991976..6ab4b319d2 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -1655,6 +1655,8 @@ template<> class GetDependantVisit : public GetDependantVisitBase { public: + // all instances need to be aligned to 16 bytes so alignment irrelevant (everything can be tightly packed) and implicit + uint64_t buildInputSize = 0; // because of zero access to the lifetime tracking between TLASes and BLASes, do nothing //core::smart_refctd_ptr* const outBLASes; @@ -1668,6 +1670,9 @@ class GetDependantVisit : public GetDependant auto depObj = getDependant(dep,soloPatch); if (!depObj) return false; + const auto instances = user.asset->getInstances(); + assert(instanceIndex SReserveResult struct DeferredASCreationParams { const IAccelerationStructure* canonical; - asset_cached_t storage; - size_t scratchSize : 45 = 0; - size_t motionBlur : 1 = false; - size_t buildFlags : 16 = 0; - size_t hostBuild : 1 = false; - size_t compactAfterBuild : 1 = false; -#ifdef NBL_ACCELERATION_STRUCTURE_CONVERSION - size_t inputSize = 0; -#endif + asset_cached_t storage = {}; + uint64_t scratchSize : 45 = 0; + uint64_t motionBlur : 1 = false; + uint64_t buildFlags : 16 = 0; 
+ uint64_t hostBuild : 1 = false; + uint64_t compactAfterBuild : 1 = false; + uint64_t buildSize = 0; }; core::vector accelerationStructureParams[2]; // Deduplication, Creation and Propagation @@ -2547,6 +2550,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { auto& dfsCache = std::get>(dfsCaches); // This map contains the assets by-hash, identical asset+patch hash the same. + // It only has entries for GPU objects that need to be created conversions_t conversionRequests; // We now go through the dfsCache and work out each entry's content hashes, so that we can carry out unique conversions. @@ -2727,8 +2731,16 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult const auto buildFlags = patch.getBuildFlags(as); const auto outIx = i+entry.second.firstCopyIx; const auto uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[outIx]; + // prevent CPU hangs by making sure allocator big enough to service us in worst case but with best case allocator (no other allocations, clean alloc) + const auto minScratchAllocSize = patch.hostBuild ? 
inputs.scratchForHostASBuildMinAllocSize:inputs.scratchForDeviceASBuildMinAllocSize; + uint64_t buildSize = 0; uint32_t buildAlignment = 4; + auto incrementBuildSize = [minScratchAllocSize,&buildSize,&buildAlignment](const uint64_t size, const uint32_t alignment)->void + { + buildSize = core::alignUp(buildSize,alignment)+hlsl::max(size,minScratchAllocSize); + buildAlignment = hlsl::max(buildAlignment,alignment); + }; ILogicalDevice::AccelerationStructureBuildSizes sizes = {}; -// size_t inputSize = 0; + const auto hashAsU64 = reinterpret_cast(entry.first.data); { if constexpr (IsTLAS) { @@ -2738,10 +2750,17 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult patch }; if (!visitor()) + { + inputs.logger.log( + "Failed to find all GPU Bottom Level Acceleration Structures needed to build TLAS %8llx%8llx%8llx%8llx", + system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); continue; + } const auto instanceCount = as->getInstances().size(); sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,instanceCount); -// inputSize = (motionBlur ? 
sizeof(IGPUTopLevelAccelerationStructure::DevicePolymorphicInstance):sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance))*instanceCount; + incrementBuildSize(visitor.buildInputSize,16); + incrementBuildSize(sizeof(uint64_t)*instanceCount,alignof(uint64_t)); } else { @@ -2763,15 +2782,15 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult reinterpret_cast*>(geoms.data()),geoms.size() }; sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); - // TODO: check if the strides need to be aligned to 4 bytes for AABBs -// for (const auto& geom : geoms) -// if (const auto aabbCount=*(pMaxPrimitiveCounts++); aabbCount) -// inputSize = core::roundUp(inputSize,sizeof(float))+aabbCount*geom.stride; } + // TODO: check if the strides need to be aligned to 4 bytes for AABBs + for (const auto& geom : geoms) + if (const auto aabbCount=*(pMaxPrimitiveCounts++); aabbCount) + incrementBuildSize(aabbCount*geom.stride,alignof(float)); } else { -// core::map allocationsPerStride; + core::map allocationsPerStride; const auto geoms = as->getTriangleGeometries(); if (patch.hostBuild) { @@ -2786,36 +2805,38 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult reinterpret_cast*>(geoms.data()),geoms.size() }; sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); -#if 0 - // TODO: check if the strides need to be aligned to 4 bytes for AABBs - for (const auto& geom : geoms) - if (const auto triCount=*(pMaxPrimitiveCounts++); triCount) + } + for (const auto& geom : geoms) + if (const auto triCount=*(pMaxPrimitiveCounts++); triCount) + { + switch (geom.indexType) { - switch (geom.indexType) - { - case E_INDEX_TYPE::EIT_16BIT: - allocationsPerStride[sizeof(uint16_t)] += triCount*3; - break; - case E_INDEX_TYPE::EIT_32BIT: - allocationsPerStride[sizeof(uint32_t)] += triCount*3; - break; - default: - break; - } - size_t bytesPerVertex = 
geom.vertexStride; - if (geom.vertexData[1]) - bytesPerVertex += bytesPerVertex; - allocationsPerStride[geom.vertexStride] += geom.maxVertex; + case E_INDEX_TYPE::EIT_16BIT: + allocationsPerStride[sizeof(uint16_t)] += triCount*3; + break; + case E_INDEX_TYPE::EIT_32BIT: + allocationsPerStride[sizeof(uint32_t)] += triCount*3; + break; + default: + break; } -#endif + allocationsPerStride[geom.vertexStride] += (geom.vertexData[1] ? 2:1)*geom.maxVertex; } -// for (const auto& entry : allocationsPerStride) -// inputSize = core::roundUp(inputSize,entry.first)+entry.first*entry.second; + for (const auto& entry : allocationsPerStride) + incrementBuildSize(entry.first*entry.second,entry.first); } } } - if (!sizes) + if (!buildSize) + { + inputs.logger.log( + "Build Size Input is 0 for Acceleration Structure %8llx%8llx%8llx%8llx", + system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); continue; + } + // scratch gets allocated first + buildSize = core::alignUp(hlsl::max(sizes.buildScratchSize,minScratchAllocSize),buildAlignment)+buildSize; // we need to save the buffer in a side-channel for later auto& out = accelerationStructureParams[IsTLAS][entry.second.firstCopyIx+i]; @@ -2831,14 +2852,19 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult params.queueFamilyIndexCount = queueFamilies.size(); params.queueFamilyIndices = queueFamilies.data(); out.storage.value = device->createBuffer(std::move(params)); + if (out.storage) + if (!deferredAllocator.request(&out.storage,patch.hostBuild ? hostBuildMemoryTypes:deviceBuildMemoryTypes)) + { + out.storage.value = nullptr; + continue; + } } out.scratchSize = sizes.buildScratchSize; out.motionBlur = motionBlur; out.buildFlags = static_cast(buildFlags.value); out.hostBuild = patch.hostBuild; out.compactAfterBuild = patch.compactAfterBuild; - if (out.storage && !deferredAllocator.request(&out.storage,patch.hostBuild ? 
hostBuildMemoryTypes:deviceBuildMemoryTypes)) - out.storage.value = nullptr; + out.buildSize = buildSize; } } if constexpr (std::is_same_v) @@ -3447,24 +3473,9 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult request.scratchSize = deferredParams.scratchSize; request.compact = deferredParams.compactAfterBuild; request.buildFlags = deferredParams.buildFlags; - // prevent CPU hangs by making sure allocator big enough to service us in worst case but with best case allocator (no other allocations, clean alloc) - // TODO: take into account the minimal allocation size from the allocator (ask for it) - size_t buildSize = deferredParams.scratchSize; - if constexpr (IsTLAS) - { - const uint32_t instanceCount = request.canonical->getInstances().size(); - // Worst case approximation, not all instances will be that size (note that host and device instance data are same size) - const size_t approxInstanceSize = deferredParams.motionBlur ? IGPUTopLevelAccelerationStructure::DevicePolymorphicInstance::LargestUnionMemberSize:sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance); - buildSize = core::alignUp(buildSize,approxInstanceSize)+instanceCount*approxInstanceSize; - buildSize = core::alignUp(buildSize,alignof(uint64_t))+instanceCount*sizeof(uint64_t); - } - else - { -// TODO: compute BLAS input size with alignments - } // sizes for building 1-by-1 vs parallel, note that BLAS and TLAS can't be built concurrently - retval.m_minASBuildScratchSize[deferredParams.hostBuild] = core::max(retval.m_minASBuildScratchSize[deferredParams.hostBuild],buildSize); - scratchSizeFullParallelBuild[deferredParams.hostBuild] += buildSize; + retval.m_minASBuildScratchSize[deferredParams.hostBuild] = core::max(retval.m_minASBuildScratchSize[deferredParams.hostBuild],deferredParams.buildSize); + scratchSizeFullParallelBuild[deferredParams.hostBuild] += deferredParams.buildSize; // note that in order to compact an AS you need to allocate a buffer range whose size is 
known only after the build if (deferredParams.compactAfterBuild) retval.m_compactedASMaxMemory += bufSz; @@ -3623,6 +3634,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul logger.log("Accceleration Structure Scratch Device Memory Allocator cannot allocate with Physical Device's minimum required AS-build scratch alignment %u",system::ILogger::ELL_ERROR,minScratchAlignment); return retval; } + // TODO: check scratchForDeviceASBuildMinAllocSize // returns non-null pointer if the buffer is writeable directly byt the host deviceASBuildScratchPtr = reinterpret_cast(scratchBuffer->getBoundMemory().memory->getMappedPointer()); // Need to use Transfer Queue and copy via staging buffer @@ -3656,10 +3668,14 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } } // the elusive and exotic host builds - if (reservations.willHostASBuild() && !params.scratchForHostASBuild) + if (reservations.willHostASBuild()) { - logger.log("An Acceleration Structure will be built on the Host but no Scratch Memory Allocator provided!", system::ILogger::ELL_ERROR); - return retval; + if (!params.scratchForHostASBuild) + { + logger.log("An Acceleration Structure will be built on the Host but no Scratch Memory Allocator provided!", system::ILogger::ELL_ERROR); + return retval; + } + // TODO: check everything else when we actually support host builds } // and compacting if (reservations.willCompactAS()) @@ -4664,14 +4680,6 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul const addr_t sizes[MaxAllocCount] = {tlasToBuild.scratchSize,instanceDataSize,sizeof(void*)*instanceCount}; { const addr_t alignments[MaxAllocCount] = {limits.minAccelerationStructureScratchOffsetAlignment,16,alignof(uint64_t)}; -/* TODO: move to reserve phase - prevent CPU hangs by making sure allocator big enough to service us -{ -addr_t worstSize = sizes[0]; -for (auto i=1u; iminScratchSize) - minScratchSize = worstSize; -}*/ const auto AllocCount = as->usesMotion() ? 
2:3; // if fail then flush and keep trying till space is made for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(AllocCount,&offsets[0],&sizes[0],&alignments[0])!=0u; t++) @@ -4692,7 +4700,6 @@ if (worstSize>minScratchSize) // stream the instance/geometry input in { bool success = true; -// TODO: make sure the overflow submit work callback is doing some CPU work { struct FillInstances : IUtilities::IUpstreamingDataProducer { From 7b643b68bc27022ddc974ab2b83e95aa0879cc10 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 7 May 2025 18:06:09 +0200 Subject: [PATCH 052/346] Make ReBAR buffer copies happen during `convert` and not `reserve` because that's supposed to be the "expensive" call Also prevent attempting to map the same memory multiple times (relevant in APIs that only allow a single mapping and we suballocate from same `IDeviceMemoryAllocation`) So now for a ReBAR upload to succeed, the memory allocation given out needs to be on the correct heap AND start off mapped (Asset Converter won't attempt to map by itself). 
--- src/nbl/video/utilities/CAssetConverter.cpp | 106 ++++++++++++-------- 1 file changed, 66 insertions(+), 40 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 6ab4b319d2..19b8f18a66 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2293,6 +2293,15 @@ class MetaDeviceMemoryAllocator final core::map> allocationRequests; }; +// for dem ReBAR goodies +bool canHostWriteToMemoryRange(const IDeviceMemoryBacked::SMemoryBinding& binding, const size_t length) +{ + assert(binding.isValid()); + const auto* memory = binding.memory; + const auto& mappedRange = memory->getMappedRange(); + return memory->isCurrentlyMapped() && memory->getCurrentMappingAccess().hasFlags(IDeviceMemoryAllocation::EMCAF_WRITE) && mappedRange.offset<=binding.offset && binding.offset+length<=mappedRange.offset+mappedRange.length; +} + // auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { @@ -2660,12 +2669,11 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult }(); core::vector> gpuObjects(gpuObjUniqueCopyGroupIDs.size()); - // Only warn once to reduce log spam auto assign = [&](const core::blake3_hash_t& contentHash, const size_t baseIx, const size_t copyIx, asset_cached_t::type&& gpuObj)->bool { const auto hashAsU64 = reinterpret_cast(contentHash.data); if constexpr (GPUObjectWhollyImmutable) // including any deps! 
- if (copyIx==1) + if (copyIx==1) // Only warn once to reduce log spam inputs.logger.log( "Why are you creating multiple Objects for asset content %8llx%8llx%8llx%8llx, when they are a readonly GPU Object Type with no dependants!?", system::ILogger::ELL_PERFORMANCE,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] @@ -3398,31 +3406,19 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // now allocate the memory for buffers and images deferredAllocator.finalize(); - // can remove buffers from conversion requests which can be written to directly - { - core::vector flushRanges; - flushRanges.reserve(retval.m_bufferConversions.size()); - std::erase_if(retval.m_bufferConversions,[&flushRanges](const SReserveResult::SConvReqBuffer& conv)->bool - { - const auto boundMemory = conv.gpuObj->getBoundMemory(); - auto* const memory = boundMemory.memory; - if (!boundMemory.memory->isMappable()) - return false; - const size_t size = conv.gpuObj->getSize(); - const IDeviceMemoryAllocation::MemoryRange range = {boundMemory.offset,size}; - // slightly inefficient but oh well - void* dst = memory->map(range,IDeviceMemoryAllocation::EMCAF_WRITE); - memcpy(dst,conv.canonical->getPointer(),size); - if (boundMemory.memory->haveToMakeVisible()) - flushRanges.emplace_back(memory,range.offset,range.length,ILogicalDevice::MappedMemoryRange::align_non_coherent_tag); + // find out which buffers need to be uploaded via a staging buffer + std::erase_if(retval.m_bufferConversions,[&](const SReserveResult::SConvReqBuffer& conv)->bool + { + if (!conv.gpuObj) return true; - } - ); - if (!flushRanges.empty()) - device->flushMappedMemoryRanges(flushRanges); - if (!retval.m_bufferConversions.empty()) - retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; - } + const auto boundMemory = conv.gpuObj->getBoundMemory(); + if (!boundMemory.isValid()) + return true; + if (!canHostWriteToMemoryRange(boundMemory,conv.gpuObj->getSize())) + retval.m_queueFlags |= 
IQueue::FAMILY_FLAGS::TRANSFER_BIT; + return false; + } + ); // Deal with Deferred Creation of Acceleration structures { const auto minScratchAlignment = device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment; @@ -3489,6 +3485,8 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult if (retval.willDeviceASBuild() || retval.willCompactAS()) retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; } + std::erase_if(retval.m_imageConversions,[&](const SReserveResult::SConvReqImage& conv)->bool {return !conv.gpuObj || !conv.gpuObj->getBoundMemory().isValid();}); + dedupCreateProp.template operator()(); dedupCreateProp.template operator()(); @@ -3559,6 +3557,32 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul assert(reservations.m_converter.get()==this); auto device = m_params.device; + auto hostBufferXferIt = reservations.m_bufferConversions.begin(); + core::vector memoryHostFlushRanges; + memoryHostFlushRanges.reserve(reservations.m_bufferConversions.size()); + auto hostUploadBuffers = [&](auto&& pred)->void + { + for (; hostBufferXferIt!=reservations.m_bufferConversions.end() && pred(); hostBufferXferIt++) + { + const size_t size = hostBufferXferIt->gpuObj->getSize(); + const auto boundMemory = hostBufferXferIt->gpuObj->getBoundMemory(); + if (!canHostWriteToMemoryRange(boundMemory,size)) + continue; + auto* const memory = boundMemory.memory; + const IDeviceMemoryAllocation::MemoryRange range = {boundMemory.offset,size}; + memcpy(reinterpret_cast(memory->getMappedPointer())+range.offset,hostBufferXferIt->canonical->getPointer(),size); + // let go of canonical asset (may free RAM) + hostBufferXferIt->canonical = nullptr; + if (memory->haveToMakeVisible()) + memoryHostFlushRanges.emplace_back(memory,range.offset,range.length,ILogicalDevice::MappedMemoryRange::align_non_coherent_tag); + } + if (!memoryHostFlushRanges.empty()) + { + device->flushMappedMemoryRanges(memoryHostFlushRanges); + 
memoryHostFlushRanges.clear(); + } + }; + // compacted TLASes need to be substituted in cache and Descriptor Sets core::unordered_map> compactedTLASMap; // Anything to do? @@ -3825,11 +3849,10 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul for (auto& item : buffersToUpload) { auto* buffer = item.gpuObj; - const SBufferRange range = { - .offset = 0, - .size = item.gpuObj->getCreationParams().size, - .buffer = core::smart_refctd_ptr(buffer) - }; + const size_t size = item.gpuObj->getCreationParams().size; + // host will upload + if (canHostWriteToMemoryRange(buffer->getBoundMemory(),size)) + continue; auto pFoundHash = findInStaging.template operator()(buffer); // const auto ownerQueueFamily = checkOwnership(buffer,params.getFinalOwnerQueueFamily(buffer,*pFoundHash),transferFamily); @@ -3839,6 +3862,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul continue; } // do the upload + const SBufferRange range = {.offset=0,.size=size,.buffer=core::smart_refctd_ptr(buffer)}; const bool success = params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,item.canonical->getPointer()); // current recording buffer may have changed xferCmdBuf = params.transfer->getCommandBufferForRecording(); @@ -3870,7 +3894,6 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul xferCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Upload Buffers END"); xferCmdBuf->cmdbuf->endDebugMarker(); } - buffersToUpload.clear(); // release ownership if (!finalReleases.empty()) pipelineBarrier(xferCmdBuf,{.memBarriers={},.bufBarriers=finalReleases},"Ownership Releases of Buffers Failed"); @@ -3908,15 +3931,16 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul return IQueue::RESULT::OTHER_ERROR; return res; }; - // compose our overflow callback on top of what's already there, only if we need to ofc + + // We want to be doing Host operations while stalled for GPU, compose our overflow callback on top of what's already 
there, only if we need to of course auto origXferStallCallback = params.transfer->overflowCallback; - if (shouldDoSomeCompute) - params.transfer->overflowCallback = [&origXferStallCallback,&drainCompute](const ISemaphore::SWaitInfo& tillScratchResettable)->void - { - drainCompute(); - if (origXferStallCallback) - origXferStallCallback(tillScratchResettable); - }; + params.transfer->overflowCallback = [device,&hostUploadBuffers,&origXferStallCallback,&drainCompute](const ISemaphore::SWaitInfo& tillScratchResettable)->void + { + drainCompute(); + if (origXferStallCallback) + origXferStallCallback(tillScratchResettable); + hostUploadBuffers([device,&tillScratchResettable]()->bool{return device->waitForSemaphores({&tillScratchResettable,1},false,0)==ISemaphore::WAIT_RESULT::TIMEOUT;}); + }; // when overflowing compute resources, we need to submit the Xfer before submitting Compute auto drainBoth = [&params,&xferCmdBuf,&drainCompute](const std::span extraSignal={})->auto { @@ -4987,6 +5011,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } } + // finish host tasks if not done yet + hostUploadBuffers([]()->bool{return true;}); // Descriptor Sets need their TLAS descriptors substituted if they've been compacted // want to check if deps successfully exist From a90ef105242c9bc47074ad189a531d1bbc03fb2d Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 8 May 2025 11:41:36 +0200 Subject: [PATCH 053/346] Make the memory requests happen immediately during `assign` also set Vulkan debug names on the created GPU objects right away. Push conversion requests to right after successful `assign` because that makes more sense. Prep for host_image_copy support. Also make should-be-private structs private.
--- include/nbl/video/utilities/CAssetConverter.h | 28 +-- src/nbl/video/utilities/CAssetConverter.cpp | 195 +++++++++--------- 2 files changed, 114 insertions(+), 109 deletions(-) diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index 3e134b913d..02cc9ab447 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -1064,21 +1064,10 @@ class CAssetConverter : public core::IReferenceCounted return enqueueSuccess; } - // public only because `GetDependantVisit` needs it - struct SDeferredTLASWrite - { - inline bool operator==(const SDeferredTLASWrite& other) const - { - return dstSet == other.dstSet && binding == other.binding && arrayElement == other.arrayElement; - } - - IGPUDescriptorSet* dstSet; - uint32_t binding; - uint32_t arrayElement; - core::smart_refctd_ptr tlas; - }; private: friend class CAssetConverter; + // internal classes + template friend class GetDependantVisit; inline SReserveResult() = default; @@ -1141,6 +1130,19 @@ class CAssetConverter : public core::IReferenceCounted }; using cpu_to_gpu_blas_map_t = core::unordered_map; cpu_to_gpu_blas_map_t m_blasBuildMap; + // + struct SDeferredTLASWrite + { + inline bool operator==(const SDeferredTLASWrite& other) const + { + return dstSet == other.dstSet && binding == other.binding && arrayElement == other.arrayElement; + } + + IGPUDescriptorSet* dstSet; + uint32_t binding; + uint32_t arrayElement; + core::smart_refctd_ptr tlas; + }; struct SDeferredTLASWriteHasher { inline size_t operator()(const SDeferredTLASWrite& write) const diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 19b8f18a66..d25dcae4f1 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -1657,8 +1657,8 @@ class GetDependantVisit : public GetDependant public: // all instances need to be aligned to 16 bytes so 
alignment irrelevant (everything can be tightly packed) and implicit uint64_t buildInputSize = 0; - // because of zero access to the lifetime tracking between TLASes and BLASes, do nothing - //core::smart_refctd_ptr* const outBLASes; + // + CAssetConverter::SReserveResult::cpu_to_gpu_blas_map_t* blasBuildMap; protected: bool descend_impl( @@ -1673,7 +1673,12 @@ class GetDependantVisit : public GetDependant const auto instances = user.asset->getInstances(); assert(instanceIndexfind(dep.asset); + if (foundBLAS!=blasBuildMap->end()) + foundBLAS->second.remainingUsages++; + else + blasBuildMap->insert(foundBLAS,{dep.asset,{depObj}}); return true; } }; @@ -2669,7 +2674,9 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult }(); core::vector> gpuObjects(gpuObjUniqueCopyGroupIDs.size()); - auto assign = [&](const core::blake3_hash_t& contentHash, const size_t baseIx, const size_t copyIx, asset_cached_t::type&& gpuObj)->bool + auto assign = [&]( + const core::blake3_hash_t& contentHash, const size_t baseIx, const size_t copyIx, asset_cached_t::type&& gpuObj, const AssetType* asset=nullptr + )->asset_traits::video_t* { const auto hashAsU64 = reinterpret_cast(contentHash.data); if constexpr (GPUObjectWhollyImmutable) // including any deps! 
@@ -2685,16 +2692,37 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult "Failed to create GPU Object for asset content %8llx%8llx%8llx%8llx", system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] ); - return false; + return nullptr; } - gpuObjects[copyIx+baseIx].value = std::move(gpuObj); - return true; + auto output = gpuObjects.data()+copyIx+baseIx; + const uint64_t uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[copyIx+baseIx]; + if constexpr (std::is_same_v || std::is_same_v) + { + const auto constrainMask = inputs.constrainMemoryTypeBits(uniqueCopyGroupID,asset,contentHash,gpuObj.get()); + if (!deferredAllocator.request(output,constrainMask)) + return nullptr; + } + // set debug names on everything! + { + std::ostringstream debugName; + debugName << "Created by Converter "; + debugName << std::hex; + debugName << this; + debugName << " from Asset with hash "; + for (const auto& byte : contentHash.data) + debugName << uint32_t(byte) << " "; + debugName << "for Group " << uniqueCopyGroupID; + gpuObj.get()->setObjectDebugName(debugName.str().c_str()); + } + output->value = std::move(gpuObj); + return output->value.get(); }; GetDependantVisitBase visitBase = { .inputs = inputs, .dfsCaches = dfsCaches }; + // Dispatch to correct creation of GPU objects if constexpr (std::is_same_v) { @@ -2707,19 +2735,21 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto& entry : conversionRequests) for (auto i=0ull; igetSize(); + params.size = asset->getSize(); params.usage = patch.usage; // concurrent ownership if any const auto outIx = i+entry.second.firstCopyIx; const auto uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[outIx]; - const auto queueFamilies = inputs.getSharedOwnershipQueueFamilies(uniqueCopyGroupID,entry.second.canonicalAsset,patch); + const auto queueFamilies = inputs.getSharedOwnershipQueueFamilies(uniqueCopyGroupID,asset,patch); params.queueFamilyIndexCount = queueFamilies.size(); 
params.queueFamilyIndices = queueFamilies.data(); - // if creation successful, we will upload - assign(entry.first,entry.second.firstCopyIx,i,device->createBuffer(std::move(params))); + // if creation successful, we will request some memory allocation to bind to, and if thats okay we preliminarily request a conversion + if (IGPUBuffer* const gpuObj=assign(entry.first,entry.second.firstCopyIx,i,device->createBuffer(std::move(params)),asset); gpuObj) + retval.m_bufferConversions.push_back({core::smart_refctd_ptr(asset),gpuObj}); } } if constexpr (std::is_same_v || std::is_same_v) @@ -2950,19 +2980,9 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // gpu image specifics params.tiling = static_cast(patch.linearTiling); params.preinitialized = false; - // if creation successful, we check what queues we need if uploading - if (assign(entry.first,entry.second.firstCopyIx,i,device->createImage(std::move(params))) && !asset->getRegions().empty()) - { - // for now until host_image_copy - retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; - // Best effort guess, without actually looking at all regions - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdCopyBufferToImage.html#VUID-vkCmdCopyBufferToImage-commandBuffer-07739 - if (isDepthOrStencilFormat(patch.format) && (patch.usageFlags|patch.stencilUsage).hasFlags(IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT)) - retval.m_queueFlags |= IQueue::FAMILY_FLAGS::GRAPHICS_BIT; - // only if we upload some data can we recompute the mips - if (patch.recomputeMips) - retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; - } + // if creation successful, we will request some memory allocation to bind to, and if thats okay we preliminarily request a conversion (if we have content to upload) + if (IGPUImage* const gpuObj=assign(entry.first,entry.second.firstCopyIx,i,device->createImage(std::move(params)),asset); gpuObj && !asset->getRegions().empty()) + 
retval.m_imageConversions.push_back({{core::smart_refctd_ptr(asset),gpuObj},bool(patch.recomputeMips)}); } } if constexpr (std::is_same_v) @@ -3314,87 +3334,54 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } } - // Propagate the results back, since the dfsCache has the original asset pointers as keys, we map in reverse - // TODO: this probably could go at the end of the object creation routines + // Propagate the results back, since the dfsCache has the original asset pointers as keys, we map in reverse (multiple `instance_t` can map to the same content hash and GPU object) // This gets deferred till AFTER the Buffer Memory Allocations and Binding for Acceleration Structures if constexpr (!std::is_same_v && !std::is_same_v) dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void - { - auto& stagingCache = std::get>(retval.m_stagingCaches); - // already found in read cache and not converted - if (created.gpuObj) - return; + { + auto& stagingCache = std::get>(retval.m_stagingCaches); + // already found in read cache and not converted + if (created.gpuObj) + return; - const auto& contentHash = created.contentHash; - auto found = conversionRequests.find(contentHash); + const auto& contentHash = created.contentHash; + auto found = conversionRequests.find(contentHash); - const auto uniqueCopyGroupID = instance.uniqueCopyGroupID; + const auto uniqueCopyGroupID = instance.uniqueCopyGroupID; - const auto hashAsU64 = reinterpret_cast(contentHash.data); - // can happen if deps were unconverted dummies - if (found==conversionRequests.end()) - { - if (contentHash!=CHashCache::NoContentHash) - inputs.logger.log( - "Could not find GPU Object for Asset %p in group %ull with Content Hash %8llx%8llx%8llx%8llx", - system::ILogger::ELL_ERROR,instance.asset,uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] - ); - return; - } - // unhashables were not supposed to be added to conversion requests - 
assert(contentHash!=CHashCache::NoContentHash); + const auto hashAsU64 = reinterpret_cast(contentHash.data); + // can happen if deps were unconverted dummies + if (found==conversionRequests.end()) + { + if (contentHash!=CHashCache::NoContentHash) + inputs.logger.log( + "Could not find GPU Object for Asset %p in group %ull with Content Hash %8llx%8llx%8llx%8llx", + system::ILogger::ELL_ERROR,instance.asset,uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); + return; + } + // unhashables were not supposed to be added to conversion requests + assert(contentHash!=CHashCache::NoContentHash); -// abstract away start - const auto copyIx = found->second.firstCopyIx++; - // the counting sort was stable - assert(uniqueCopyGroupID==gpuObjUniqueCopyGroupIDs[copyIx]); + const auto copyIx = found->second.firstCopyIx++; + // the counting sort was stable + assert(uniqueCopyGroupID==gpuObjUniqueCopyGroupIDs[copyIx]); - auto& gpuObj = gpuObjects[copyIx]; - if (!gpuObj) - { - inputs.logger.log( - "Conversion for Content Hash %8llx%8llx%8llx%8llx Copy Index %d from Canonical Asset %p Failed.", - system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3],copyIx,found->second.canonicalAsset - ); - return; - } - // set debug names on everything! 
- { - std::ostringstream debugName; - debugName << "Created by Converter "; - debugName << std::hex; - debugName << this; - debugName << " from Asset with hash "; - for (const auto& byte : contentHash.data) - debugName << uint32_t(byte) << " "; - debugName << "for Group " << uniqueCopyGroupID; - gpuObj.get()->setObjectDebugName(debugName.str().c_str()); - } - // insert into staging cache - stagingCache.emplace(gpuObj.get(),typename CCache::key_t(contentHash,uniqueCopyGroupID)); - // propagate back to dfsCache - created.gpuObj = std::move(gpuObj); -// abstract away end - // record if a device memory allocation will be needed - if constexpr (std::is_base_of_v::video_t>) - { - const auto constrainMask = inputs.constrainMemoryTypeBits(uniqueCopyGroupID,instance.asset,contentHash,created.gpuObj.get()); - if (!deferredAllocator.request(&created.gpuObj,constrainMask)) + auto& gpuObj = gpuObjects[copyIx]; + if (!gpuObj) { - created.gpuObj.value = nullptr; + inputs.logger.log( + "Creation of GPU Object (or its dependents) for Content Hash %8llx%8llx%8llx%8llx Copy Index %d from Canonical Asset %p Failed.", + system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3],copyIx,found->second.canonicalAsset + ); return; } + // insert into staging cache + stagingCache.emplace(gpuObj.get(),typename CCache::key_t(contentHash,uniqueCopyGroupID)); + // propagate back to dfsCache + created.gpuObj = std::move(gpuObj); } - // - if constexpr (std::is_same_v) - retval.m_bufferConversions.emplace_back(SReserveResult::SConvReqBuffer{core::smart_refctd_ptr(instance.asset),created.gpuObj.get()}); - if constexpr (std::is_same_v) - { - const uint16_t recomputeMips = created.patch.recomputeMips; - retval.m_imageConversions.emplace_back(SReserveResult::SConversionRequestBase{core::smart_refctd_ptr(instance.asset),created.gpuObj.get()},recomputeMips); - } - } - ); + ); }; // The order of these calls is super important to go BOTTOM UP in terms of hashing and conversion 
dependants. // Both so we can hash in O(Depth) and not O(Depth^2) but also so we have all the possible dependants ready. @@ -3409,8 +3396,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // find out which buffers need to be uploaded via a staging buffer std::erase_if(retval.m_bufferConversions,[&](const SReserveResult::SConvReqBuffer& conv)->bool { - if (!conv.gpuObj) - return true; + assert(conv.gpuObj); const auto boundMemory = conv.gpuObj->getBoundMemory(); if (!boundMemory.isValid()) return true; @@ -3485,7 +3471,24 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult if (retval.willDeviceASBuild() || retval.willCompactAS()) retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; } - std::erase_if(retval.m_imageConversions,[&](const SReserveResult::SConvReqImage& conv)->bool {return !conv.gpuObj || !conv.gpuObj->getBoundMemory().isValid();}); + // find out which images need what caps for the transfer and mipmapping + std::erase_if(retval.m_imageConversions,[&](const SReserveResult::SConvReqImage& conv)->bool + { + assert(conv.gpuObj); + const auto boundMemory = conv.gpuObj->getBoundMemory(); + if (!boundMemory.isValid()) + return true; + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; + if (conv.recomputeMips) + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; + // Best effort guess, without actually looking at all regions + const auto& params = conv.gpuObj->getCreationParameters(); + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdCopyBufferToImage.html#VUID-vkCmdCopyBufferToImage-commandBuffer-07739 + if (isDepthOrStencilFormat(params.format) && (params.depthUsage|params.stencilUsage).hasFlags(IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT)) + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::GRAPHICS_BIT; + return false; + } + ); dedupCreateProp.template operator()(); From 11255d4f7d99279851f41ae5f025912f437b73f6 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 8 May 
2025 20:24:27 +0700 Subject: [PATCH 054/346] Implement ICPURayTracingPipeline --- include/nbl/asset/ICPURayTracingPipeline.h | 122 +++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 include/nbl/asset/ICPURayTracingPipeline.h diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h new file mode 100644 index 0000000000..23a1d82225 --- /dev/null +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -0,0 +1,122 @@ + +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_ASSET_I_CPU_RAY_TRACING_PIPELINE_H_INCLUDED_ +#define _NBL_ASSET_I_CPU_RAY_TRACING_PIPELINE_H_INCLUDED_ + +#include "nbl/asset/IRayTracingPipeline.h" +#include "nbl/asset/ICPUPipeline.h" + + +namespace nbl::asset +{ + +//! CPU Version of RayTracing Pipeline +class ICPURayTracingPipeline final : public ICPUPipeline> +{ + using pipeline_base_t = IRayTracingPipeline; + using base_t = ICPUPipeline; + + public: + struct SHitGroupSpecInfo { + SShaderSpecInfo closestHit; + SShaderSpecInfo anyHit; + SShaderSpecInfo intersection; + + SHitGroupSpecInfo clone(uint32_t depth) const + { + auto newSpecInfo = *this; + if (depth > 0u) + { + newSpecInfo.closestHit.shader = core::smart_refctd_ptr_static_cast(this->closestHit.shader->clone(depth - 1u)); + newSpecInfo.anyHit.shader = core::smart_refctd_ptr_static_cast(this->anyHit.shader->clone(depth - 1u)); + newSpecInfo.intersection.shader = core::smart_refctd_ptr_static_cast(this->intersection.shader->clone(depth - 1u)); + } + return newSpecInfo; + } + }; + + static core::smart_refctd_ptr create(const ICPUPipelineLayout* layout) + { + auto retval = new ICPURayTracingPipeline(layout); + return core::smart_refctd_ptr(retval,core::dont_grab); + } + + inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final + 
{ + auto newPipeline = new ICPURayTracingPipeline(layout.get()); + newPipeline->m_raygen = m_raygen.clone(depth); + + newPipeline->m_misses.resize(m_misses.size()); + for (auto specInfo_i = 0u; specInfo_i < m_misses.size(); specInfo_i++) + { + newPipeline->m_misses[specInfo_i] = m_misses[specInfo_i].clone(depth); + } + + newPipeline->m_hitGroups.resize(m_hitGroups.size()); + for (auto specInfo_i = 0u; specInfo_i < m_misses.size(); specInfo_i++) + { + newPipeline->m_hitGroups[specInfo_i] = m_hitGroups[specInfo_i].clone(depth); + } + + newPipeline->m_callables.resize(m_callables.size()); + for (auto specInfo_i = 0u; specInfo_i < m_callables.size(); specInfo_i++) + { + newPipeline->m_callables[specInfo_i] = m_callables[specInfo_i].clone(depth); + } + + newPipeline->m_params = m_params; + return core::smart_refctd_ptr(newPipeline); + } + + constexpr static inline auto AssetType = ET_RAYTRACING_PIPELINE; + inline E_TYPE getAssetType() const override { return AssetType; } + + //! + inline size_t getDependantCount() const override { + //TODO(kevinyu): Implement or refactor the api design to something else + return 0; + } + + inline virtual std::span getSpecInfo(hlsl::ShaderStage stage) const override final + { + switch (stage) + { + case hlsl::ShaderStage::ESS_RAYGEN: + return { &m_raygen, 1 }; + } + return {}; + } + + inline virtual bool valid() const override final + { + // TODO(kevinyu): Fix this temporary dummy code + return true; + } + + protected: + virtual ~ICPURayTracingPipeline() = default; + + inline IAsset* getDependant_impl(const size_t ix) override + { + //TODO(kevinyu): remove this function, since this is expensive + return nullptr; + } + + + private: + + SShaderSpecInfo m_raygen; + core::vector m_misses; + core::vector m_hitGroups; + core::vector m_callables; + + explicit ICPURayTracingPipeline(const ICPUPipelineLayout* layout) + : base_t(layout, {}) + {} + +}; + +} +#endif From 343f3954db9ec73607d5fbe5dd1e4b3641a88f20 Mon Sep 17 00:00:00 2001 From: devsh 
Date: Thu, 8 May 2025 17:39:54 +0200 Subject: [PATCH 055/346] prepare the refactor to be able to propagate the deferredly created Acceleration Structures to staging cache --- src/nbl/video/utilities/CAssetConverter.cpp | 179 +++++++++++--------- 1 file changed, 96 insertions(+), 83 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index d25dcae4f1..b87cbbfdde 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -1993,20 +1993,6 @@ class GetDependantVisit : public GetDependantVisitBase -struct unique_conversion_t -{ - const AssetType* canonicalAsset = nullptr; - patch_index_t patchIndex = {}; - size_t firstCopyIx : 40 = 0u; - size_t copyCount : 24 = 1u; -}; - -// Map from ContentHash to canonical asset & patch and the list of uniqueCopyGroupIDs -template -using conversions_t = core::unordered_map>; - // Needed both for reservation and conversion class MetaDeviceMemoryAllocator final { @@ -2307,6 +2293,24 @@ bool canHostWriteToMemoryRange(const IDeviceMemoryBacked::SMemoryBinding& bindin return memory->isCurrentlyMapped() && memory->getCurrentMappingAccess().hasFlags(IDeviceMemoryAllocation::EMCAF_WRITE) && mappedRange.offset<=binding.offset && binding.offset+length<=mappedRange.offset+mappedRange.length; } +// +template +struct unique_conversion_t +{ + const AssetType* canonicalAsset = nullptr; + patch_index_t patchIndex = {}; + size_t firstCopyIx : 40 = 0u; + size_t copyCount : 24 = 1u; +}; + +// Map from ContentHash to canonical asset & patch and the list of uniqueCopyGroupIDs +template +struct conversions_t +{ + core::unordered_map> contentHashToCanonical; + core::vector> gpuObjects; +}; + // auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { @@ -2544,6 +2548,53 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // can now spawn our own hash cache retval.m_hashCache =
core::make_smart_refctd_ptr(); + // Since the dfsCache has the original asset pointers as keys, we map in reverse (multiple `instance_t` can map to the same unique content hash and GPU object) + auto propagateToStagingCache = [&inputs,&dfsCaches,&retval](conversions_t& conversionRequests)->void + { + std::get>(dfsCaches).for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void + { + auto& stagingCache = std::get>(retval.m_stagingCaches); + // already found in read cache and not converted + if (created.gpuObj) + return; + + const auto uniqueCopyGroupID = instance.uniqueCopyGroupID; + const auto& contentHash = created.contentHash; + const auto hashAsU64 = reinterpret_cast(contentHash.data); + + auto& map = conversionRequests.contentHashToCanonical; + auto found = map.find(contentHash); + // can happen if deps were unconverted dummies + if (found==map.end()) + { + if (contentHash!=CHashCache::NoContentHash) + inputs.logger.log( + "Could not find GPU Object for Asset %p in group %ull with Content Hash %8llx%8llx%8llx%8llx", + system::ILogger::ELL_ERROR, instance.asset, uniqueCopyGroupID, hashAsU64[0], hashAsU64[1], hashAsU64[2], hashAsU64[3] + ); + return; + } + // unhashables were not supposed to be added to conversion requests + assert(contentHash!=CHashCache::NoContentHash); + + const auto copyIx = found->second.firstCopyIx++; + auto& gpuObj = conversionRequests.gpuObjects[copyIx]; + if (!gpuObj) + { + inputs.logger.log( + "Creation of GPU Object (or its dependents) for Content Hash %8llx%8llx%8llx%8llx Copy Index %d from Canonical Asset %p Failed.", + system::ILogger::ELL_ERROR, hashAsU64[0], hashAsU64[1], hashAsU64[2], hashAsU64[3], copyIx, found->second.canonicalAsset + ); + return; + } + // insert into staging cache + stagingCache.emplace(gpuObj.get(),typename CCache::key_t(contentHash,uniqueCopyGroupID)); + // propagate back to dfsCache + created.gpuObj = std::move(gpuObj); + } + ); + }; + MetaDeviceMemoryAllocator 
deferredAllocator(inputs.allocator ? inputs.allocator:device,inputs.logger); // BLAS and TLAS creation is somewhat delayed by buffer creation and allocation @@ -2560,7 +2611,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult }; core::vector accelerationStructureParams[2]; // Deduplication, Creation and Propagation - auto dedupCreateProp = [&]()->void + auto dedupCreateProp = [&]()->conversions_t { auto& dfsCache = std::get>(dfsCaches); // This map contains the assets by-hash, identical asset+patch hash the same. @@ -2623,7 +2674,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } // then de-duplicate the conversions needed const patch_index_t patchIx = {static_cast(std::distance(dfsCache.nodes.data(),&created))}; - auto [inSetIt,inserted] = conversionRequests.emplace(contentHash,unique_conversion_t{.canonicalAsset=instance.asset,.patchIndex=patchIx}); + auto [inSetIt,inserted] = conversionRequests.contentHashToCanonical.emplace(contentHash,unique_conversion_t{.canonicalAsset=instance.asset,.patchIndex=patchIx}); if (!inserted) { // If an element prevented insertion, the patch must be identical! 
@@ -2642,7 +2693,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult auto exclScanConvReqs = [&]()->size_t { size_t sum = 0; - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { entry.second.firstCopyIx = sum; sum += entry.second.copyCount; @@ -2655,9 +2706,10 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { if (created.gpuObj) return; - auto found = conversionRequests.find(created.contentHash); + auto& map = conversionRequests.contentHashToCanonical; + auto found = map.find(created.contentHash); // may not find things because of unconverted dummy deps - if (found!=conversionRequests.end()) + if (found!=map.end()) retval[found->second.firstCopyIx++] = instance.uniqueCopyGroupID; else { @@ -2673,7 +2725,9 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult return retval; }(); - core::vector> gpuObjects(gpuObjUniqueCopyGroupIDs.size()); + // + conversionRequests.gpuObjects.resize(gpuObjUniqueCopyGroupIDs.size()); + // auto assign = [&]( const core::blake3_hash_t& contentHash, const size_t baseIx, const size_t copyIx, asset_cached_t::type&& gpuObj, const AssetType* asset=nullptr )->asset_traits::video_t* @@ -2694,7 +2748,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult ); return nullptr; } - auto output = gpuObjects.data()+copyIx+baseIx; + auto output = conversionRequests.gpuObjects.data()+copyIx+baseIx; const uint64_t uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[copyIx+baseIx]; if constexpr (std::is_same_v || std::is_same_v) { @@ -2726,13 +2780,13 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // Dispatch to correct creation of GPU objects if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) for (auto i=0ull; 
i(entry.first,entry.second.firstCopyIx,i,device->createSampler(entry.second.canonicalAsset->getParams())); } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) for (auto i=0ull; i SReserveResult const auto hostBuildMemoryTypes = device->getPhysicalDevice()->getMemoryTypeBitsFromMemoryTypeFlags(mem_prop_f::EMPF_DEVICE_LOCAL_BIT|mem_prop_f::EMPF_HOST_WRITABLE_BIT|mem_prop_f::EMPF_HOST_CACHED_BIT); constexpr bool IsTLAS = std::is_same_v; - accelerationStructureParams[IsTLAS].resize(gpuObjects.size()); - for (auto& entry : conversionRequests) + accelerationStructureParams[IsTLAS].resize(conversionRequests.gpuObjects.size()); + for (auto& entry : conversionRequests.contentHashToCanonical) for (auto i=0ull; i SReserveResult } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) for (auto i=0ull; i SReserveResult } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUBufferView* asset = entry.second.canonicalAsset; const auto& patch = dfsCache.nodes[entry.second.patchIndex.value].patch; @@ -3009,7 +3063,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUImageView* asset = entry.second.canonicalAsset; const auto& cpuParams = asset->getCreationParameters(); @@ -3057,7 +3111,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult .readCache = inputs.readShaderCache, .writeCache = inputs.writeShaderCache }; - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) for (auto i=0ull; i SReserveResult } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + 
for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUDescriptorSetLayout* asset = entry.second.canonicalAsset; // there is no patching possible for this asset @@ -3135,7 +3189,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { core::vector pcRanges; pcRanges.reserve(CSPIRVIntrospector::MaxPushConstantsSize); - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUPipelineLayout* asset = entry.second.canonicalAsset; const auto& patch = dfsCache.nodes[entry.second.patchIndex.value].patch; @@ -3185,7 +3239,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUPipelineCache* asset = entry.second.canonicalAsset; // there is no patching possible for this asset @@ -3199,7 +3253,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUComputePipeline* asset = entry.second.canonicalAsset; // there is no patching possible for this asset @@ -3230,7 +3284,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPURenderpass* asset = entry.second.canonicalAsset; // there is no patching possible for this asset @@ -3246,7 +3300,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { core::vector tmpSpecInfo; tmpSpecInfo.reserve(5); - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUGraphicsPipeline* asset = entry.second.canonicalAsset; // there is no patching possible for this asset @@ 
-3294,7 +3348,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // Descriptor Pools have large up-front slots reserved for all descriptor types, if we were to merge // multiple descriptor sets to be allocated from one pool, dropping any set wouldn't result in the // reclamation of the memory used, it would at most (with the FREE pool create flag) return to pool. - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUDescriptorSet* asset = entry.second.canonicalAsset; for (auto i=0ull; i SReserveResult } } - // Propagate the results back, since the dfsCache has the original asset pointers as keys, we map in reverse (multiple `instance_t` can map to the same content hash and GPU object) // This gets deferred till AFTER the Buffer Memory Allocations and Binding for Acceleration Structures - if constexpr (!std::is_same_v && !std::is_same_v) - dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void - { - auto& stagingCache = std::get>(retval.m_stagingCaches); - // already found in read cache and not converted - if (created.gpuObj) - return; - - const auto& contentHash = created.contentHash; - auto found = conversionRequests.find(contentHash); - - const auto uniqueCopyGroupID = instance.uniqueCopyGroupID; - - const auto hashAsU64 = reinterpret_cast(contentHash.data); - // can happen if deps were unconverted dummies - if (found==conversionRequests.end()) - { - if (contentHash!=CHashCache::NoContentHash) - inputs.logger.log( - "Could not find GPU Object for Asset %p in group %ull with Content Hash %8llx%8llx%8llx%8llx", - system::ILogger::ELL_ERROR,instance.asset,uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] - ); - return; - } - // unhashables were not supposed to be added to conversion requests - assert(contentHash!=CHashCache::NoContentHash); - - const auto copyIx = found->second.firstCopyIx++; - // the counting sort was stable - 
assert(uniqueCopyGroupID==gpuObjUniqueCopyGroupIDs[copyIx]); - - auto& gpuObj = gpuObjects[copyIx]; - if (!gpuObj) - { - inputs.logger.log( - "Creation of GPU Object (or its dependents) for Content Hash %8llx%8llx%8llx%8llx Copy Index %d from Canonical Asset %p Failed.", - system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3],copyIx,found->second.canonicalAsset - ); - return; - } - // insert into staging cache - stagingCache.emplace(gpuObj.get(),typename CCache::key_t(contentHash,uniqueCopyGroupID)); - // propagate back to dfsCache - created.gpuObj = std::move(gpuObj); - } - ); + if constexpr (!std::is_base_of_v) + { + propagateToStagingCache.template operator()(conversionRequests); + return {}; + } + return conversionRequests; }; // The order of these calls is super important to go BOTTOM UP in terms of hashing and conversion dependants. // Both so we can hash in O(Depth) and not O(Depth^2) but also so we have all the possible dependants ready. From bbce9f51a4a8fc5401a4ce0317eb8eba7b854460 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 9 May 2025 15:13:08 +0200 Subject: [PATCH 056/346] refactor the conversion request system --- src/nbl/video/utilities/CAssetConverter.cpp | 659 ++++++++++---------- 1 file changed, 335 insertions(+), 324 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index b87cbbfdde..0dc431f8ae 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2025,6 +2025,7 @@ class MetaDeviceMemoryAllocator final if ((memReqs.memoryTypeBits&memoryTypeConstraint)==0) { m_logger.log("Overconstrained the Memory Type Index bitmask %d with %d for %s",system::ILogger::ELL_ERROR,memReqs.memoryTypeBits,memoryTypeConstraint,gpuObj->getObjectDebugName()); + pGpuObj->value = nullptr; return false; } // @@ -2044,6 +2045,7 @@ class MetaDeviceMemoryAllocator final if (!allocation.isValid()) { m_logger.log("Failed to 
allocate and bind dedicated memory for %s",system::ILogger::ELL_ERROR,gpuObj->getObjectDebugName()); + pGpuObj->value = nullptr; return false; } } @@ -2307,8 +2309,210 @@ struct unique_conversion_t template struct conversions_t { - core::unordered_map> contentHashToCanonical; - core::vector> gpuObjects; + public: + // Go through the dfsCache and work out each entry's content hashes, so that we can carry out unique conversions. + void gather(core::tuple_transform_t& dfsCaches, CAssetConverter::CHashCache* hashCache, const CAssetConverter::CCache* readCache) + { + auto& dfsCache = std::get>(dfsCaches); + dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void + { + // compute the hash or look it up if it exists + // We mistrust every dependency such that the eject/update if needed. + // Its really important that the Deduplication gets performed Bottom-Up + auto& contentHash = created.contentHash; + PatchOverride patchOverride(*inputs,dfsCaches,instance.uniqueCopyGroupID); + contentHash = hashCache->hash( + {instance.asset,&created.patch}, + &patchOverride, + /*.mistrustLevel =*/ 1 + ); + // failed to hash all together (only possible reason is failure of `PatchGetter` to provide a valid patch) + if (contentHash==CAssetConverter::CHashCache::NoContentHash) + { + inputs->logger.log("Could not compute hash for asset %p in group %d, maybe an IPreHashed dependant's content hash is missing?",system::ILogger::ELL_ERROR,instance.asset,instance.uniqueCopyGroupID); + return; + } + const auto hashAsU64 = reinterpret_cast(contentHash.data); + { + inputs->logger.log("Asset (%p,%d) has hash %8llx%8llx%8llx%8llx",system::ILogger::ELL_DEBUG,instance.asset,instance.uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3]); + } + // if we have a read cache, lets retry looking the item up! 
+ if (readCache) + { + // We can't look up "near misses" (supersets of patches) because they'd have different hashes + // and we can't afford to split hairs like finding overlapping buffer ranges, etc. + // Stuff like that would require a completely different hashing/lookup strategy (or multiple fake entries). + const auto found = readCache->find({contentHash,instance.uniqueCopyGroupID}); + if (found!=readCache->forwardMapEnd()) + { + created.gpuObj = found->second; + inputs->logger.log( + "Asset (%p,%d) with hash %8llx%8llx%8llx%8llx found its GPU Object in Read Cache",system::ILogger::ELL_DEBUG, + instance.asset,instance.uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); + return; + } + } + // The conversion request we insert needs an instance asset whose unconverted dependencies don't have missing content + // SUPER SIMPLIFICATION: because we hash and search for readCache items bottom up (BFS), we don't need a stack (DFS) here! + // Any dependant that's not getting a GPU object due to missing content or GPU cache object for its cache, will show up later during `getDependant` + // An additional optimization would be to improve the `PatchGetter` to check dependants (only deps) during hashing for missing dfs cache gpu Object (no read cache) and no conversion request. 
+ auto* isPrehashed = dynamic_cast(instance.asset); + if (isPrehashed && isPrehashed->missingContent()) + { + inputs->logger.log( + "PreHashed Asset (%p,%d) with hash %8llx%8llx%8llx%8llx has missing content and no GPU Object in Read Cache!",system::ILogger::ELL_ERROR, + instance.asset,instance.uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); + return; + } + // then de-duplicate the conversions needed + const patch_index_t patchIx = {static_cast(std::distance(dfsCache.nodes.data(),&created))}; + auto [inSetIt,inserted] = contentHashToCanonical.emplace(contentHash,unique_conversion_t{.canonicalAsset=instance.asset,.patchIndex=patchIx}); + if (!inserted) + { + // If an element prevented insertion, the patch must be identical! + // Because the conversions don't care about groupIDs, the patches may be identical but not the same object in memory. + assert(inSetIt->second.patchIndex==patchIx || dfsCache.nodes[inSetIt->second.patchIndex.value].patch==dfsCache.nodes[patchIx.value].patch); + inSetIt->second.copyCount++; + } + } + ); + + // work out mapping of `conversionRequests` to multiple GPU objects and their copy groups via counting sort + { + // assign storage offsets via exclusive scan and put the `uniqueGroupID` mappings in sorted order + auto exclScanConvReqs = [&]()->size_t + { + size_t sum = 0; + for (auto& entry : contentHashToCanonical) + { + entry.second.firstCopyIx = sum; + sum += entry.second.copyCount; + } + return sum; + }; + gpuObjUniqueCopyGroupIDs.resize(exclScanConvReqs()); + // + dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void + { + if (created.gpuObj) + return; + auto found = contentHashToCanonical.find(created.contentHash); + // may not find things because of unconverted dummy deps + if (found!=contentHashToCanonical.end()) + gpuObjUniqueCopyGroupIDs[found->second.firstCopyIx++] = instance.uniqueCopyGroupID; + else + { + inputs->logger.log( + "No conversion request made for Asset %p in 
group %d, its impossible to convert.", + system::ILogger::ELL_ERROR,instance.asset,instance.uniqueCopyGroupID + ); + } + } + ); + // `{conversionRequests}.firstCopyIx` needs to be brought back down to exclusive scan form + exclScanConvReqs(); + } + + // we now know the size of out output array + gpuObjects.resize(gpuObjUniqueCopyGroupIDs.size()); + } + + // + template + void assign(const core::blake3_hash_t& contentHash, const size_t baseIx, const size_t copyIx, asset_cached_t::type&& gpuObj, const AssetType* asset=nullptr) + { + const auto hashAsU64 = reinterpret_cast(contentHash.data); + if constexpr (GPUObjectWhollyImmutable) // including any deps! + if (copyIx==1) // Only warn once to reduce log spam + inputs->logger.log( + "Why are you creating multiple Objects for asset content %8llx%8llx%8llx%8llx, when they are a readonly GPU Object Type with no dependants!?", + system::ILogger::ELL_PERFORMANCE,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); + // + if (!gpuObj) + { + inputs->logger.log( + "Failed to create GPU Object for asset content %8llx%8llx%8llx%8llx", + system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); + return; + } + auto output = gpuObjects.data()+copyIx+baseIx; + output->value = std::move(gpuObj); + const uint64_t uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[copyIx+baseIx]; + if constexpr (std::is_same_v || std::is_same_v) + { + const auto constrainMask = inputs->constrainMemoryTypeBits(uniqueCopyGroupID,asset,contentHash,gpuObj.get()); + if (!deferredAllocator->request(output,constrainMask)) + return; + } + // set debug names on everything! 
+ { + std::ostringstream debugName; + debugName << "Created by Converter "; + debugName << std::hex; + debugName << this; + debugName << " from Asset with hash "; + for (const auto& byte : contentHash.data) + debugName << uint32_t(byte) << " "; + debugName << "for Group " << uniqueCopyGroupID; + output->get()->setObjectDebugName(debugName.str().c_str()); + } + } + + // Since the dfsCache has the original asset pointers as keys, we map in reverse (multiple `instance_t` can map to the same unique content hash and GPU object) + void propagateToCaches(dfs_cache& dfsCache, CAssetConverter::SReserveResult::staging_cache_t& stagingCache) + { + assert(gpuObjUniqueCopyGroupIDs.empty()); + dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void + { + // already found in read cache and not converted + if (created.gpuObj) + return; + + const auto uniqueCopyGroupID = instance.uniqueCopyGroupID; + const auto& contentHash = created.contentHash; + const auto hashAsU64 = reinterpret_cast(contentHash.data); + + auto found = contentHashToCanonical.find(contentHash); + // can happen if deps were unconverted dummies + if (found==contentHashToCanonical.end()) + { + if (contentHash!=CAssetConverter::CHashCache::NoContentHash) + inputs->logger.log( + "Could not find GPU Object for Asset %p in group %ull with Content Hash %8llx%8llx%8llx%8llx", + system::ILogger::ELL_ERROR, instance.asset, uniqueCopyGroupID, hashAsU64[0], hashAsU64[1], hashAsU64[2], hashAsU64[3] + ); + return; + } + // unhashables were not supposed to be added to conversion requests + assert(contentHash!=CAssetConverter::CHashCache::NoContentHash); + + const auto copyIx = found->second.firstCopyIx++; + auto& gpuObj = gpuObjects[copyIx]; + if (!gpuObj) + { + inputs->logger.log( + "Creation of GPU Object (or its dependents) for Content Hash %8llx%8llx%8llx%8llx Copy Index %d from Canonical Asset %p Failed.", + system::ILogger::ELL_ERROR, hashAsU64[0], hashAsU64[1], hashAsU64[2], hashAsU64[3], 
copyIx, found->second.canonicalAsset + ); + return; + } + // insert into staging cache + stagingCache.emplace(gpuObj.get(),typename CAssetConverter::CCache::key_t(contentHash,uniqueCopyGroupID)); + // propagate back to dfsCache + created.gpuObj = std::move(gpuObj); + } + ); + } + + const CAssetConverter::SInputs* inputs; + MetaDeviceMemoryAllocator* deferredAllocator; + core::unordered_map> contentHashToCanonical; + core::vector gpuObjUniqueCopyGroupIDs; + core::vector> gpuObjects; }; // @@ -2548,53 +2752,6 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // can now spawn our own hash cache retval.m_hashCache = core::make_smart_refctd_ptr(); - // Since the dfsCache has the original asset pointers as keys, we map in reverse (multiple `instance_t` can map to the same unique content hash and GPU object) - auto propagateToStagingCache = [&inputs,&dfsCaches,&retval](conversions_t& conversionRequests)->void - { - std::get>(dfsCaches).for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void - { - auto& stagingCache = std::get>(retval.m_stagingCaches); - // already found in read cache and not converted - if (created.gpuObj) - return; - - const auto uniqueCopyGroupID = instance.uniqueCopyGroupID; - const auto& contentHash = created.contentHash; - const auto hashAsU64 = reinterpret_cast(contentHash.data); - - auto& map = conversionRequests.contentHashToCanonical; - auto found = map.find(contentHash); - // can happen if deps were unconverted dummies - if (found==map.end()) - { - if (contentHash!=CHashCache::NoContentHash) - inputs.logger.log( - "Could not find GPU Object for Asset %p in group %ull with Content Hash %8llx%8llx%8llx%8llx", - system::ILogger::ELL_ERROR, instance.asset, uniqueCopyGroupID, hashAsU64[0], hashAsU64[1], hashAsU64[2], hashAsU64[3] - ); - return; - } - // unhashables were not supposed to be added to conversion requests - assert(contentHash!=CHashCache::NoContentHash); - - const auto copyIx = 
found->second.firstCopyIx++; - auto& gpuObj = conversionRequests.gpuObjects[copyIx]; - if (!gpuObj) - { - inputs.logger.log( - "Creation of GPU Object (or its dependents) for Content Hash %8llx%8llx%8llx%8llx Copy Index %d from Canonical Asset %p Failed.", - system::ILogger::ELL_ERROR, hashAsU64[0], hashAsU64[1], hashAsU64[2], hashAsU64[3], copyIx, found->second.canonicalAsset - ); - return; - } - // insert into staging cache - stagingCache.emplace(gpuObj.get(),typename CCache::key_t(contentHash,uniqueCopyGroupID)); - // propagate back to dfsCache - created.gpuObj = std::move(gpuObj); - } - ); - }; - MetaDeviceMemoryAllocator deferredAllocator(inputs.allocator ? inputs.allocator:device,inputs.logger); // BLAS and TLAS creation is somewhat delayed by buffer creation and allocation @@ -2613,176 +2770,27 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // Deduplication, Creation and Propagation auto dedupCreateProp = [&]()->conversions_t { - auto& dfsCache = std::get>(dfsCaches); // This map contains the assets by-hash, identical asset+patch hash the same. // It only has entries for GPU objects that need to be created - conversions_t conversionRequests; + conversions_t conversionRequests = {&inputs,&deferredAllocator}; - // We now go through the dfsCache and work out each entry's content hashes, so that we can carry out unique conversions. + // const CCache* readCache = inputs.readCache ? (&std::get>(inputs.readCache->m_caches)):nullptr; - dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void - { - // compute the hash or look it up if it exists - // We mistrust every dependency such that the eject/update if needed. 
- // Its really important that the Deduplication gets performed Bottom-Up - auto& contentHash = created.contentHash; - PatchOverride patchOverride(inputs,dfsCaches,instance.uniqueCopyGroupID); - contentHash = retval.getHashCache()->hash( - {instance.asset,&created.patch}, - &patchOverride, - /*.mistrustLevel =*/ 1 - ); - // failed to hash all together (only possible reason is failure of `PatchGetter` to provide a valid patch) - if (contentHash==CHashCache::NoContentHash) - { - inputs.logger.log("Could not compute hash for asset %p in group %d, maybe an IPreHashed dependant's content hash is missing?",system::ILogger::ELL_ERROR,instance.asset,instance.uniqueCopyGroupID); - return; - } - const auto hashAsU64 = reinterpret_cast(contentHash.data); - { - inputs.logger.log("Asset (%p,%d) has hash %8llx%8llx%8llx%8llx",system::ILogger::ELL_DEBUG,instance.asset,instance.uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3]); - } - // if we have a read cache, lets retry looking the item up! - if (readCache) - { - // We can't look up "near misses" (supersets of patches) because they'd have different hashes - // and we can't afford to split hairs like finding overlapping buffer ranges, etc. - // Stuff like that would require a completely different hashing/lookup strategy (or multiple fake entries). - const auto found = readCache->find({contentHash,instance.uniqueCopyGroupID}); - if (found!=readCache->forwardMapEnd()) - { - created.gpuObj = found->second; - inputs.logger.log( - "Asset (%p,%d) with hash %8llx%8llx%8llx%8llx found its GPU Object in Read Cache",system::ILogger::ELL_DEBUG, - instance.asset,instance.uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] - ); - return; - } - } - // The conversion request we insert needs an instance asset whose unconverted dependencies don't have missing content - // SUPER SIMPLIFICATION: because we hash and search for readCache items bottom up (BFS), we don't need a stack (DFS) here! 
- // Any dependant that's not getting a GPU object due to missing content or GPU cache object for its cache, will show up later during `getDependant` - // An additional optimization would be to improve the `PatchGetter` to check dependants (only deps) during hashing for missing dfs cache gpu Object (no read cache) and no conversion request. - auto* isPrehashed = dynamic_cast(instance.asset); - if (isPrehashed && isPrehashed->missingContent()) - { - inputs.logger.log( - "PreHashed Asset (%p,%d) with hash %8llx%8llx%8llx%8llx has missing content and no GPU Object in Read Cache!",system::ILogger::ELL_ERROR, - instance.asset,instance.uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] - ); - return; - } - // then de-duplicate the conversions needed - const patch_index_t patchIx = {static_cast(std::distance(dfsCache.nodes.data(),&created))}; - auto [inSetIt,inserted] = conversionRequests.contentHashToCanonical.emplace(contentHash,unique_conversion_t{.canonicalAsset=instance.asset,.patchIndex=patchIx}); - if (!inserted) - { - // If an element prevented insertion, the patch must be identical! - // Because the conversions don't care about groupIDs, the patches may be identical but not the same object in memory. 
- assert(inSetIt->second.patchIndex==patchIx || dfsCache.nodes[inSetIt->second.patchIndex.value].patch==dfsCache.nodes[patchIx.value].patch); - inSetIt->second.copyCount++; - } - } - ); + conversionRequests.gather(dfsCaches,retval.m_hashCache.get(),readCache); - // work out mapping of `conversionRequests` to multiple GPU objects and their copy groups via counting sort - const auto gpuObjUniqueCopyGroupIDs = [&]()->core::vector - { - core::vector retval; - // now assign storage offsets via exclusive scan and put the `uniqueGroupID` mappings in sorted order - auto exclScanConvReqs = [&]()->size_t - { - size_t sum = 0; - for (auto& entry : conversionRequests.contentHashToCanonical) - { - entry.second.firstCopyIx = sum; - sum += entry.second.copyCount; - } - return sum; - }; - retval.resize(exclScanConvReqs()); - // - dfsCache.for_each([&inputs,&retval,&conversionRequests](const instance_t& instance, dfs_cache::created_t& created)->void - { - if (created.gpuObj) - return; - auto& map = conversionRequests.contentHashToCanonical; - auto found = map.find(created.contentHash); - // may not find things because of unconverted dummy deps - if (found!=map.end()) - retval[found->second.firstCopyIx++] = instance.uniqueCopyGroupID; - else - { - inputs.logger.log( - "No conversion request made for Asset %p in group %d, its impossible to convert.", - system::ILogger::ELL_ERROR,instance.asset,instance.uniqueCopyGroupID - ); - } - } - ); - // `{conversionRequests}.firstCopyIx` needs to be brought back down to exclusive scan form - exclScanConvReqs(); - return retval; - }(); - // - conversionRequests.gpuObjects.resize(gpuObjUniqueCopyGroupIDs.size()); - // - auto assign = [&]( - const core::blake3_hash_t& contentHash, const size_t baseIx, const size_t copyIx, asset_cached_t::type&& gpuObj, const AssetType* asset=nullptr - )->asset_traits::video_t* - { - const auto hashAsU64 = reinterpret_cast(contentHash.data); - if constexpr (GPUObjectWhollyImmutable) // including any deps! 
- if (copyIx==1) // Only warn once to reduce log spam - inputs.logger.log( - "Why are you creating multiple Objects for asset content %8llx%8llx%8llx%8llx, when they are a readonly GPU Object Type with no dependants!?", - system::ILogger::ELL_PERFORMANCE,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] - ); - // - if (!gpuObj) - { - inputs.logger.log( - "Failed to create GPU Object for asset content %8llx%8llx%8llx%8llx", - system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] - ); - return nullptr; - } - auto output = conversionRequests.gpuObjects.data()+copyIx+baseIx; - const uint64_t uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[copyIx+baseIx]; - if constexpr (std::is_same_v || std::is_same_v) - { - const auto constrainMask = inputs.constrainMemoryTypeBits(uniqueCopyGroupID,asset,contentHash,gpuObj.get()); - if (!deferredAllocator.request(output,constrainMask)) - return nullptr; - } - // set debug names on everything! - { - std::ostringstream debugName; - debugName << "Created by Converter "; - debugName << std::hex; - debugName << this; - debugName << " from Asset with hash "; - for (const auto& byte : contentHash.data) - debugName << uint32_t(byte) << " "; - debugName << "for Group " << uniqueCopyGroupID; - gpuObj.get()->setObjectDebugName(debugName.str().c_str()); - } - output->value = std::move(gpuObj); - return output->value.get(); - }; - GetDependantVisitBase visitBase = { .inputs = inputs, .dfsCaches = dfsCaches }; // Dispatch to correct creation of GPU objects + auto& dfsCache = std::get>(dfsCaches); if constexpr (std::is_same_v) { for (auto& entry : conversionRequests.contentHashToCanonical) for (auto i=0ull; i(entry.first,entry.second.firstCopyIx,i,device->createSampler(entry.second.canonicalAsset->getParams())); + conversionRequests.template assign(entry.first,entry.second.firstCopyIx,i,device->createSampler(entry.second.canonicalAsset->getParams())); } if constexpr (std::is_same_v) { @@ -2797,13 +2805,12 @@ auto 
CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult params.usage = patch.usage; // concurrent ownership if any const auto outIx = i+entry.second.firstCopyIx; - const auto uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[outIx]; + const auto uniqueCopyGroupID = conversionRequests.gpuObjUniqueCopyGroupIDs[outIx]; const auto queueFamilies = inputs.getSharedOwnershipQueueFamilies(uniqueCopyGroupID,asset,patch); params.queueFamilyIndexCount = queueFamilies.size(); params.queueFamilyIndices = queueFamilies.data(); // if creation successful, we will request some memory allocation to bind to, and if thats okay we preliminarily request a conversion - if (IGPUBuffer* const gpuObj=assign(entry.first,entry.second.firstCopyIx,i,device->createBuffer(std::move(params)),asset); gpuObj) - retval.m_bufferConversions.push_back({core::smart_refctd_ptr(asset),gpuObj}); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,device->createBuffer(std::move(params)),asset); } } if constexpr (std::is_same_v || std::is_same_v) @@ -2822,7 +2829,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult const bool motionBlur = patch.isMotion; const auto buildFlags = patch.getBuildFlags(as); const auto outIx = i+entry.second.firstCopyIx; - const auto uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[outIx]; + const auto uniqueCopyGroupID = conversionRequests.gpuObjUniqueCopyGroupIDs[outIx]; // prevent CPU hangs by making sure allocator big enough to service us in worst case but with best case allocator (no other allocations, clean alloc) const auto minScratchAllocSize = patch.hostBuild ? inputs.scratchForHostASBuildMinAllocSize:inputs.scratchForDeviceASBuildMinAllocSize; uint64_t buildSize = 0; uint32_t buildAlignment = 4; @@ -2946,10 +2953,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult out.storage.value = device->createBuffer(std::move(params)); if (out.storage) if (!deferredAllocator.request(&out.storage,patch.hostBuild ? 
hostBuildMemoryTypes:deviceBuildMemoryTypes)) - { - out.storage.value = nullptr; continue; - } } out.scratchSize = sizes.buildScratchSize; out.motionBlur = motionBlur; @@ -3027,16 +3031,15 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } // concurrent ownership if any const auto outIx = i+entry.second.firstCopyIx; - const auto uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[outIx]; + const auto uniqueCopyGroupID = conversionRequests.gpuObjUniqueCopyGroupIDs[outIx]; const auto queueFamilies = inputs.getSharedOwnershipQueueFamilies(uniqueCopyGroupID,asset,patch); params.queueFamilyIndexCount = queueFamilies.size(); params.queueFamilyIndices = queueFamilies.data(); // gpu image specifics params.tiling = static_cast(patch.linearTiling); params.preinitialized = false; - // if creation successful, we will request some memory allocation to bind to, and if thats okay we preliminarily request a conversion (if we have content to upload) - if (IGPUImage* const gpuObj=assign(entry.first,entry.second.firstCopyIx,i,device->createImage(std::move(params)),asset); gpuObj && !asset->getRegions().empty()) - retval.m_imageConversions.push_back({{core::smart_refctd_ptr(asset),gpuObj},bool(patch.recomputeMips)}); + // if creation successful, we will request some memory allocation to bind to + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,device->createImage(std::move(params)),asset); } } if constexpr (std::is_same_v) @@ -3048,7 +3051,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto i=0ull; i> visitor = { {visitBase}, {asset,uniqueCopyGroupID}, @@ -3057,7 +3060,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult if (!visitor()) continue; // no format promotion for buffer views - assign(entry.first,entry.second.firstCopyIx,i,device->createBufferView(visitor.underlying,asset->getFormat())); + 
conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,device->createBufferView(visitor.underlying,asset->getFormat())); } } } @@ -3071,7 +3074,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto i=0ull; i> visitor = { {visitBase}, {asset,uniqueCopyGroupID}, @@ -3100,7 +3103,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // if underlying image had mip-chain extended then we extend our own if (imageParams.mipLevels!=visitor.oldMipCount) params.subresourceRange.levelCount = imageParams.mipLevels-params.subresourceRange.baseMipLevel; - assign(entry.first,entry.second.firstCopyIx,i,device->createImageView(std::move(params))); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,device->createImageView(std::move(params))); } } } @@ -3115,7 +3118,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto i=0ull; icreateShader(createParams)); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,device->createShader(createParams)); } } if constexpr (std::is_same_v) @@ -3169,7 +3172,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto i=0ull; i> visitor = { { @@ -3181,7 +3184,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult }; if (!visitor()) continue; - assign(entry.first,entry.second.firstCopyIx,i,device->createDescriptorSetLayout(bindings)); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,device->createDescriptorSetLayout(bindings)); } } } @@ -3224,7 +3227,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto i=0ull; i> visitor = { {visitBase}, {asset,uniqueCopyGroupID}, @@ -3233,7 +3236,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult if (!visitor()) continue; auto layout = 
device->createPipelineLayout(pcRanges,std::move(visitor.dsLayouts[0]),std::move(visitor.dsLayouts[1]),std::move(visitor.dsLayouts[2]),std::move(visitor.dsLayouts[3])); - assign(entry.first,entry.second.firstCopyIx,i,std::move(layout)); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,std::move(layout)); } } } @@ -3247,7 +3250,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { // since we don't have dependants we don't care about our group ID // we create threadsafe pipeline caches, because we have no idea how they may be used - assign.template operator()(entry.first,entry.second.firstCopyIx,i,device->createPipelineCache(asset,false)); + conversionRequests.template assign(entry.first,entry.second.firstCopyIx,i,device->createPipelineCache(asset,false)); } } } @@ -3260,7 +3263,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto i=0ull; i> visitor = { {visitBase}, {asset,uniqueCopyGroupID}, @@ -3278,7 +3281,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult params.shader = visitor.getSpecInfo(IShader::E_SHADER_STAGE::ESS_COMPUTE); device->createComputePipelines(inputs.pipelineCache,{¶ms,1},&ppln); } - assign(entry.first,entry.second.firstCopyIx,i,std::move(ppln)); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,std::move(ppln)); } } } @@ -3292,7 +3295,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { // since we don't have dependants we don't care about our group ID // we create threadsafe pipeline caches, because we have no idea how they may be used - assign.template operator()(entry.first,entry.second.firstCopyIx,i,device->createRenderpass(asset->getCreationParameters())); + conversionRequests.template assign(entry.first,entry.second.firstCopyIx,i,device->createRenderpass(asset->getCreationParameters())); } } } @@ -3307,7 +3310,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto 
i=0ull; i> visitor = { {visitBase}, {asset,uniqueCopyGroupID}, @@ -3337,7 +3340,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } params.cached = asset->getCachedCreationParams(); device->createGraphicsPipelines(inputs.pipelineCache,{¶ms,1},&ppln); - assign(entry.first,entry.second.firstCopyIx,i,std::move(ppln)); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,std::move(ppln)); } } } @@ -3354,7 +3357,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto i=0ull; i> visitor = { {visitBase}, {asset,uniqueCopyGroupID}, @@ -3383,127 +3386,135 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } else inputs.logger.log("Failed to create Descriptor Pool suited for Layout %s",system::ILogger::ELL_ERROR,layout->getObjectDebugName()); - assign(entry.first,entry.second.firstCopyIx,i,std::move(ds)); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,std::move(ds)); } } } - // This gets deferred till AFTER the Buffer Memory Allocations and Binding for Acceleration Structures - if constexpr (!std::is_base_of_v) + // clear what we don't need + conversionRequests.gpuObjUniqueCopyGroupIDs.clear(); + // This gets deferred till AFTER the Buffer Memory Allocations and Binding + if constexpr (!std::is_base_of_v && !std::is_base_of_v::video_t>) { - propagateToStagingCache.template operator()(conversionRequests); + conversionRequests.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); return {}; } return conversionRequests; }; - // The order of these calls is super important to go BOTTOM UP in terms of hashing and conversion dependants. - // Both so we can hash in O(Depth) and not O(Depth^2) but also so we have all the possible dependants ready. - // If two Asset chains are independent then we order them from most catastrophic failure to least. 
- dedupCreateProp.template operator()(); - dedupCreateProp.template operator()(); - dedupCreateProp.template operator()(); - dedupCreateProp.template operator()(); - // now allocate the memory for buffers and images - deferredAllocator.finalize(); - - // find out which buffers need to be uploaded via a staging buffer - std::erase_if(retval.m_bufferConversions,[&](const SReserveResult::SConvReqBuffer& conv)->bool + // scope so the conversion requests go our of scope early + { + // The order of these calls is super important to go BOTTOM UP in terms of hashing and conversion dependants. + // Both so we can hash in O(Depth) and not O(Depth^2) but also so we have all the possible dependants ready. + // If two Asset chains are independent then we order them from most catastrophic failure to least. + auto bufferConversions = dedupCreateProp.template operator()(); + auto blasConversions = dedupCreateProp.template operator()(); + auto tlasConversions = dedupCreateProp.template operator()(); + auto imageConversions = dedupCreateProp.template operator()(); + // now allocate the memory for buffers and images + deferredAllocator.finalize(); + + // find out which buffers need to be uploaded via a staging buffer + for (auto& entry : bufferConversions.contentHashToCanonical) + for (auto i=0ull; igetBoundMemory(); - if (!boundMemory.isValid()) - return true; - if (!canHostWriteToMemoryRange(boundMemory,conv.gpuObj->getSize())) + const auto boundMemory = gpuBuff->getBoundMemory(); + assert(boundMemory.isValid()); + if (!canHostWriteToMemoryRange(boundMemory,gpuBuff->getSize())) retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; - return false; + retval.m_bufferConversions.push_back({core::smart_refctd_ptr(entry.second.canonicalAsset),gpuBuff.get()}); } - ); - // Deal with Deferred Creation of Acceleration structures - { - const auto minScratchAlignment = device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment; - auto 
createAccelerationStructures = [&]()->void + bufferConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); + // Deal with Deferred Creation of Acceleration structures { - constexpr bool IsTLAS = std::is_same_v; - // TLAS and BLAS can't build concurrently, index 0 is device build, 1 is host build - size_t scratchSizeFullParallelBuild[2] = {0,0}; - // - core::vector>* pConversions; - if constexpr (IsTLAS) - pConversions = retval.m_tlasConversions; - else - pConversions = retval.m_blasConversions; - // we collect that stats AFTER making sure that the BLAS / TLAS can actually be created - for (const auto& deferredParams : accelerationStructureParams[IsTLAS]) + const auto minScratchAlignment = device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment; + auto createAccelerationStructures = [&]()->void { - // buffer failed to create/allocate - if (!deferredParams.storage) - continue; - const auto bufSz = deferredParams.storage.get()->getSize(); - IGPUAccelerationStructure::SCreationParams baseParams; - { - using create_f = IGPUAccelerationStructure::SCreationParams::FLAGS; - baseParams = { - .bufferRange = {.offset=0,.size=bufSz,.buffer=deferredParams.storage.value}, - .flags = deferredParams.motionBlur ? create_f::MOTION_BIT:create_f::NONE - }; - } + constexpr bool IsTLAS = std::is_same_v; + // TLAS and BLAS can't build concurrently, index 0 is device build, 1 is host build + size_t scratchSizeFullParallelBuild[2] = {0,0}; // - auto& request = pConversions[deferredParams.hostBuild].emplace_back(); - request.canonical = smart_refctd_ptr(static_cast(deferredParams.canonical)); - smart_refctd_ptr::video_t> as; + core::vector>* pConversions; if constexpr (IsTLAS) - { - // is there any reason for it to be more? 
- const uint32_t maxInstances = request.canonical->getInstances().size(); - as = device->createTopLevelAccelerationStructure({std::move(baseParams),maxInstances}); - } + pConversions = retval.m_tlasConversions; else - as = device->createBottomLevelAccelerationStructure(std::move(baseParams)); - request.gpuObj = as.get(); - if (!request.gpuObj) + pConversions = retval.m_blasConversions; + // we collect that stats AFTER making sure that the BLAS / TLAS can actually be created + for (const auto& deferredParams : accelerationStructureParams[IsTLAS]) { - inputs.logger.log("Failed to Create Acceleration Structure.",system::ILogger::ELL_ERROR); - continue; + // buffer failed to create/allocate + if (!deferredParams.storage) + continue; + const auto bufSz = deferredParams.storage.get()->getSize(); + IGPUAccelerationStructure::SCreationParams baseParams; + { + using create_f = IGPUAccelerationStructure::SCreationParams::FLAGS; + baseParams = { + .bufferRange = {.offset=0,.size=bufSz,.buffer=deferredParams.storage.value}, + .flags = deferredParams.motionBlur ? create_f::MOTION_BIT:create_f::NONE + }; + } + // + auto& request = pConversions[deferredParams.hostBuild].emplace_back(); + request.canonical = smart_refctd_ptr(static_cast(deferredParams.canonical)); + smart_refctd_ptr::video_t> as; + if constexpr (IsTLAS) + { + // is there any reason for it to be more? 
+ const uint32_t maxInstances = request.canonical->getInstances().size(); + as = device->createTopLevelAccelerationStructure({std::move(baseParams),maxInstances}); + } + else + as = device->createBottomLevelAccelerationStructure(std::move(baseParams)); + request.gpuObj = as.get(); + if (!request.gpuObj) + { + inputs.logger.log("Failed to Create Acceleration Structure.",system::ILogger::ELL_ERROR); + continue; + } + request.scratchSize = deferredParams.scratchSize; + request.compact = deferredParams.compactAfterBuild; + request.buildFlags = deferredParams.buildFlags; + // sizes for building 1-by-1 vs parallel, note that BLAS and TLAS can't be built concurrently + retval.m_minASBuildScratchSize[deferredParams.hostBuild] = core::max(retval.m_minASBuildScratchSize[deferredParams.hostBuild],deferredParams.buildSize); + scratchSizeFullParallelBuild[deferredParams.hostBuild] += deferredParams.buildSize; + // note that in order to compact an AS you need to allocate a buffer range whose size is known only after the build + if (deferredParams.compactAfterBuild) + retval.m_compactedASMaxMemory += bufSz; } - request.scratchSize = deferredParams.scratchSize; - request.compact = deferredParams.compactAfterBuild; - request.buildFlags = deferredParams.buildFlags; - // sizes for building 1-by-1 vs parallel, note that BLAS and TLAS can't be built concurrently - retval.m_minASBuildScratchSize[deferredParams.hostBuild] = core::max(retval.m_minASBuildScratchSize[deferredParams.hostBuild],deferredParams.buildSize); - scratchSizeFullParallelBuild[deferredParams.hostBuild] += deferredParams.buildSize; - // note that in order to compact an AS you need to allocate a buffer range whose size is known only after the build - if (deferredParams.compactAfterBuild) - retval.m_compactedASMaxMemory += bufSz; - } - retval.m_maxASBuildScratchSize[0] = core::max(scratchSizeFullParallelBuild[0],retval.m_maxASBuildScratchSize[0]); - retval.m_maxASBuildScratchSize[1] = 
core::max(scratchSizeFullParallelBuild[1],retval.m_maxASBuildScratchSize[1]); - }; - createAccelerationStructures.template operator()(); - createAccelerationStructures.template operator()(); - // - if (retval.willDeviceASBuild() || retval.willCompactAS()) - retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; - } - // find out which images need what caps for the transfer and mipmapping - std::erase_if(retval.m_imageConversions,[&](const SReserveResult::SConvReqImage& conv)->bool + retval.m_maxASBuildScratchSize[0] = core::max(scratchSizeFullParallelBuild[0],retval.m_maxASBuildScratchSize[0]); + retval.m_maxASBuildScratchSize[1] = core::max(scratchSizeFullParallelBuild[1],retval.m_maxASBuildScratchSize[1]); + }; + createAccelerationStructures.template operator()(); + blasConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); +// TODO: don't build BLASes which aren't roots or use by any TLAS + createAccelerationStructures.template operator()(); + tlasConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); + // + if (retval.willDeviceASBuild() || retval.willCompactAS()) + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; + } + // find out which images need what caps for the transfer and mipmapping + auto& dfsCacheImages = std::get>(dfsCaches); + for (auto& entry : imageConversions.contentHashToCanonical) + for (auto i=0ull; igetRegions().empty()) { - assert(conv.gpuObj); - const auto boundMemory = conv.gpuObj->getBoundMemory(); - if (!boundMemory.isValid()) - return true; + const auto boundMemory = gpuImg->getBoundMemory(); + assert(boundMemory.isValid()); retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; - if (conv.recomputeMips) + const bool recomputeMips = dfsCacheImages.nodes[entry.second.patchIndex.value].patch.recomputeMips; + if (recomputeMips) retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; // Best effort guess, without actually looking at all regions - const auto& 
params = conv.gpuObj->getCreationParameters(); + const auto& params = gpuImg->getCreationParameters(); // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdCopyBufferToImage.html#VUID-vkCmdCopyBufferToImage-commandBuffer-07739 if (isDepthOrStencilFormat(params.format) && (params.depthUsage|params.stencilUsage).hasFlags(IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT)) retval.m_queueFlags |= IQueue::FAMILY_FLAGS::GRAPHICS_BIT; - return false; + retval.m_imageConversions.push_back({{core::smart_refctd_ptr(entry.second.canonicalAsset),gpuImg.get()},recomputeMips}); } - ); - - + imageConversions.propagateToCaches(dfsCacheImages,std::get>(retval.m_stagingCaches)); + } dedupCreateProp.template operator()(); dedupCreateProp.template operator()(); dedupCreateProp.template operator()(); From e0fe7ed43fea13f2db838f94d276f5bb45794b9c Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 9 May 2025 17:23:02 +0200 Subject: [PATCH 057/346] Start work on fixing orphan GPU objects due to parent failures in `CAssetConverter::reserve` TODO: make the staging cache refcounted with heterogenous (non-refcounted) lookup --- include/nbl/video/utilities/CAssetConverter.h | 3 +- src/nbl/video/utilities/CAssetConverter.cpp | 277 +++++++++++------- 2 files changed, 170 insertions(+), 110 deletions(-) diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index 02cc9ab447..9175f20a86 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -1083,6 +1083,7 @@ class CAssetConverter : public core::IReferenceCounted // we don't insert into the writeCache until conversions are successful core::tuple_transform_t m_stagingCaches; + // need a more explicit list of GPU objects that need device-assisted conversion template struct SConversionRequestBase @@ -1141,7 +1142,7 @@ class CAssetConverter : public core::IReferenceCounted IGPUDescriptorSet* dstSet; uint32_t binding; uint32_t 
arrayElement; - core::smart_refctd_ptr tlas; + const IGPUTopLevelAccelerationStructure* tlas; }; struct SDeferredTLASWriteHasher { diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 0dc431f8ae..32f9408365 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -1655,10 +1655,8 @@ template<> class GetDependantVisit : public GetDependantVisitBase { public: - // all instances need to be aligned to 16 bytes so alignment irrelevant (everything can be tightly packed) and implicit - uint64_t buildInputSize = 0; - // - CAssetConverter::SReserveResult::cpu_to_gpu_blas_map_t* blasBuildMap; + // TODO: deal with usages not going through because of cancelled TLAS builds, by gathering in a top-down pass at the end of `reserve` + CAssetConverter::SReserveResult::cpu_to_gpu_blas_map_t* blasBuildMap = nullptr; protected: bool descend_impl( @@ -1670,15 +1668,16 @@ class GetDependantVisit : public GetDependant auto depObj = getDependant(dep,soloPatch); if (!depObj) return false; - const auto instances = user.asset->getInstances(); - assert(instanceIndexfind(dep.asset); - if (foundBLAS!=blasBuildMap->end()) - foundBLAS->second.remainingUsages++; - else - blasBuildMap->insert(foundBLAS,{dep.asset,{depObj}}); + if (blasBuildMap) + { + const auto instances = user.asset->getInstances(); + assert(instanceIndexfind(dep.asset); + if (foundBLAS!=blasBuildMap->end()) + foundBLAS->second.remainingUsages++; + else + blasBuildMap->insert(foundBLAS,{dep.asset,{depObj}}); + } return true; } }; @@ -1961,7 +1960,7 @@ class GetDependantVisit : public GetDependantVisitBase) { - deferredTLASWrites.push_back({nullptr,binding.data,element,depObj}); + deferredTLASWrites.push_back({nullptr,binding.data,element,depObj.get()}); return true; } // @@ -2305,6 +2304,20 @@ struct unique_conversion_t size_t copyCount : 24 = 1u; }; +// +inline void setDebugName(const CAssetConverter* conv, IBackendObject* 
gpuObj, const core::blake3_hash_t& contentHash, const uint64_t uniqueCopyGroupID) +{ + std::ostringstream debugName; + debugName << "Created by Converter "; + debugName << std::hex; + debugName << conv; + debugName << " from Asset with hash "; + for (const auto& byte : contentHash.data) + debugName << uint32_t(byte) << " "; + debugName << "for Group " << uniqueCopyGroupID; + gpuObj->setObjectDebugName(debugName.str().c_str()); +} + // Map from ContentHash to canonical asset & patch and the list of uniqueCopyGroupIDs template struct conversions_t @@ -2449,17 +2462,7 @@ struct conversions_t return; } // set debug names on everything! - { - std::ostringstream debugName; - debugName << "Created by Converter "; - debugName << std::hex; - debugName << this; - debugName << " from Asset with hash "; - for (const auto& byte : contentHash.data) - debugName << uint32_t(byte) << " "; - debugName << "for Group " << uniqueCopyGroupID; - output->get()->setObjectDebugName(debugName.str().c_str()); - } + setDebugName(conv,output->get(),contentHash,uniqueCopyGroupID); } // Since the dfsCache has the original asset pointers as keys, we map in reverse (multiple `instance_t` can map to the same unique content hash and GPU object) @@ -2508,6 +2511,7 @@ struct conversions_t ); } + const CAssetConverter* conv; const CAssetConverter::SInputs* inputs; MetaDeviceMemoryAllocator* deferredAllocator; core::unordered_map> contentHashToCanonical; @@ -2759,11 +2763,9 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { const IAccelerationStructure* canonical; asset_cached_t storage = {}; - uint64_t scratchSize : 45 = 0; - uint64_t motionBlur : 1 = false; - uint64_t buildFlags : 16 = 0; - uint64_t hostBuild : 1 = false; - uint64_t compactAfterBuild : 1 = false; + uint64_t patchIx = 0; + uint64_t uniqueCopyGroupID = 0; + uint64_t scratchSize = 0; uint64_t buildSize = 0; }; core::vector accelerationStructureParams[2]; @@ -2772,7 +2774,7 @@ auto CAssetConverter::reserve(const 
SInputs& inputs) -> SReserveResult { // This map contains the assets by-hash, identical asset+patch hash the same. // It only has entries for GPU objects that need to be created - conversions_t conversionRequests = {&inputs,&deferredAllocator}; + conversions_t conversionRequests = {this,&inputs,&deferredAllocator}; // const CCache* readCache = inputs.readCache ? (&std::get>(inputs.readCache->m_caches)):nullptr; @@ -2825,7 +2827,8 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto i=0ull; i SReserveResult { if constexpr (IsTLAS) { - AssetVisitor> visitor = { - {visitBase}, - {as,uniqueCopyGroupID}, - patch - }; - if (!visitor()) - { - inputs.logger.log( - "Failed to find all GPU Bottom Level Acceleration Structures needed to build TLAS %8llx%8llx%8llx%8llx", - system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] - ); - continue; - } + // TLAS can't check for the BLASes existing yet, because they haven't had their backing buffers allocated yet const auto instanceCount = as->getInstances().size(); sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,instanceCount); - incrementBuildSize(visitor.buildInputSize,16); + // all instances need to be aligned to 16 bytes so alignment irrelevant (everything can be tightly packed) and implicit + const uint64_t worstCaseInstanceSize = motionBlur ? 
IGPUTopLevelAccelerationStructure::DevicePolymorphicInstance::LargestUnionMemberSize:sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance); + // worst case approximation is fine here + incrementBuildSize(worstCaseInstanceSize*instanceCount,16); incrementBuildSize(sizeof(uint64_t)*instanceCount,alignof(uint64_t)); } else @@ -2952,14 +2946,15 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult params.queueFamilyIndices = queueFamilies.data(); out.storage.value = device->createBuffer(std::move(params)); if (out.storage) - if (!deferredAllocator.request(&out.storage,patch.hostBuild ? hostBuildMemoryTypes:deviceBuildMemoryTypes)) - continue; + { + nbl::video::setDebugName(this,out.storage.value.get(),entry.first,uniqueCopyGroupID); + if (!deferredAllocator.request(&out.storage,patch.hostBuild ? hostBuildMemoryTypes:deviceBuildMemoryTypes)) + continue; + } } + out.patchIx = patchIx; + out.uniqueCopyGroupID = uniqueCopyGroupID; out.scratchSize = sizes.buildScratchSize; - out.motionBlur = motionBlur; - out.buildFlags = static_cast(buildFlags.value); - out.hostBuild = patch.hostBuild; - out.compactAfterBuild = patch.compactAfterBuild; out.buildSize = buildSize; } } @@ -3413,6 +3408,8 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // now allocate the memory for buffers and images deferredAllocator.finalize(); + // TODO: everything below is slightly wrong due to not having a final top-down dependency checking pass throwing away useless non-root GPU subtrees + // find out which buffers need to be uploaded via a staging buffer for (auto& entry : bufferConversions.contentHashToCanonical) for (auto i=0ull; i SReserveResult else pConversions = retval.m_blasConversions; // we collect that stats AFTER making sure that the BLAS / TLAS can actually be created - for (const auto& deferredParams : accelerationStructureParams[IsTLAS]) + for (size_t i=0; i(deferredParams.canonical); + const auto& dfsNode = 
std::get>(dfsCaches).nodes[deferredParams.patchIx]; + const auto& patch = dfsNode.patch; + // create the AS const auto bufSz = deferredParams.storage.get()->getSize(); IGPUAccelerationStructure::SCreationParams baseParams; { using create_f = IGPUAccelerationStructure::SCreationParams::FLAGS; baseParams = { .bufferRange = {.offset=0,.size=bufSz,.buffer=deferredParams.storage.value}, - .flags = deferredParams.motionBlur ? create_f::MOTION_BIT:create_f::NONE + .flags = patch.isMotion ? create_f::MOTION_BIT:create_f::NONE }; } - // - auto& request = pConversions[deferredParams.hostBuild].emplace_back(); - request.canonical = smart_refctd_ptr(static_cast(deferredParams.canonical)); smart_refctd_ptr::video_t> as; if constexpr (IsTLAS) { + // check if the BLASes we want to use for the instances were successfully allocated and created + AssetVisitor> visitor = { + {inputs,dfsCaches,&retval.m_blasBuildMap}, + {canonical,deferredParams.uniqueCopyGroupID}, + patch + }; + if (!visitor()) + { + inputs.logger.log( + "Failed to find all GPU Bottom Level Acceleration Structures needed to build TLAS %8llx%8llx%8llx%8llx", + system::ILogger::ELL_ERROR//,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); + continue; + } // is there any reason for it to be more? 
- const uint32_t maxInstances = request.canonical->getInstances().size(); + const uint32_t maxInstances = canonical->getInstances().size(); as = device->createTopLevelAccelerationStructure({std::move(baseParams),maxInstances}); } else as = device->createBottomLevelAccelerationStructure(std::move(baseParams)); - request.gpuObj = as.get(); - if (!request.gpuObj) + if (!as) { inputs.logger.log("Failed to Create Acceleration Structure.",system::ILogger::ELL_ERROR); continue; } + // file the request for conversion + auto& request = pConversions[patch.hostBuild].emplace_back(); + request.canonical = smart_refctd_ptr(canonical); + request.gpuObj = as.get(); request.scratchSize = deferredParams.scratchSize; - request.compact = deferredParams.compactAfterBuild; - request.buildFlags = deferredParams.buildFlags; + request.compact = patch.compactAfterBuild; + request.buildFlags = static_cast(patch.getBuildFlags(canonical).value); // sizes for building 1-by-1 vs parallel, note that BLAS and TLAS can't be built concurrently - retval.m_minASBuildScratchSize[deferredParams.hostBuild] = core::max(retval.m_minASBuildScratchSize[deferredParams.hostBuild],deferredParams.buildSize); - scratchSizeFullParallelBuild[deferredParams.hostBuild] += deferredParams.buildSize; + retval.m_minASBuildScratchSize[patch.hostBuild] = core::max(retval.m_minASBuildScratchSize[patch.hostBuild],deferredParams.buildSize); + scratchSizeFullParallelBuild[patch.hostBuild] += deferredParams.buildSize; // note that in order to compact an AS you need to allocate a buffer range whose size is known only after the build - if (deferredParams.compactAfterBuild) + if (patch.compactAfterBuild) retval.m_compactedASMaxMemory += bufSz; } retval.m_maxASBuildScratchSize[0] = core::max(scratchSizeFullParallelBuild[0],retval.m_maxASBuildScratchSize[0]); @@ -3487,7 +3500,6 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult }; createAccelerationStructures.template operator()(); 
blasConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); -// TODO: don't build BLASes which aren't roots or use by any TLAS createAccelerationStructures.template operator()(); tlasConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); // @@ -3566,6 +3578,52 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult }; core::for_each_in_tuple(inputs.assets,finalize); + // A failed conversion can cause dangling GPU object pointers, and needless work for objects which will die soon after, so prune with a Top-Down pass anything thats not reachable from a root + { + // we use a genious trick, if someone else is using the GPU object, the refcount must obviously be greater than 1 + auto pruneStaging = [&]()->void + { + auto& stagingCache = std::get>(retval.m_stagingCaches); + phmap::erase_if(stagingCache,[](const auto& entry)->bool + { + if constexpr (std::is_same_v) + { + // TODO: gather into m_deferredTLASDescriptorWrites + } + return entry.first->getReferenceCount()==1; + } + ); + }; + // The order these are called is paramount, the Higher Level User needs to die to let go of dependants and make our Garbage Collection work +// pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + // need to nerf any writes to descriptor sets which don't exist anymore before checking the refcounts on them + phmap::erase_if(retval.m_deferredTLASDescriptorWrites,[&](const auto& entry)->bool + { + auto& dsStaging = std::get>(retval.m_stagingCaches); + return 
dsStaging.find(entry.dstSet)!=dsStaging.end(); + } + ); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + } + + // TODO: defer the conversion requests until final objects are known (or knock them out) -> maybe change the conversion requests to unordered_map ? + + // TODO: only now get the queue flags + retval.m_converter = core::smart_refctd_ptr(this); retval.m_logger = system::logger_opt_smart_ptr(core::smart_refctd_ptr(inputs.logger.get())); return retval; @@ -3610,6 +3668,40 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } }; + // + auto findInStaging = [&reservations](const typename asset_traits::video_t* gpuObj)->core::blake3_hash_t* + { + auto& stagingCache = std::get>(reservations.m_stagingCaches); + const auto found = stagingCache.find(const_cast::video_t*>(gpuObj)); + assert(found!=stagingCache.end()); + return const_cast(&found->second.value); + }; + // wipe gpu item in staging cache (this may drop it as well if it was made for only a root asset == no users) + core::unordered_map outputReverseMap; + core::for_each_in_tuple(reservations.m_gpuObjects,[&outputReverseMap](const auto& gpuObjects)->void + { + uint32_t i = 0; + for (const auto& gpuObj : gpuObjects) + outputReverseMap[gpuObj.value.get()] = i++; + } + ); + auto markFailureInStaging = [&reservations,&outputReverseMap,logger](const char* message, smart_refctd_ptr& canonical, const typename asset_traits::video_t* gpuObj, core::blake3_hash_t* hash)->void + { + // wipe the smart pointer to the canonical, make sure we release that memory ASAP if no other user is around + canonical = nullptr; + logger.log("%s failed for \"%s\"",system::ILogger::ELL_ERROR,message,gpuObj->getObjectDebugName()); + // change the content hash on the reverse map to a NoContentHash + *hash = CHashCache::NoContentHash; + // also drop the smart pointer from the output array so failures release memory quickly + const auto foundIx = 
outputReverseMap.find(gpuObj); + if (foundIx!=outputReverseMap.end()) + { + auto& resultOutput = std::get>(reservations.m_gpuObjects); + resultOutput[foundIx->second].value = nullptr; + outputReverseMap.erase(foundIx); + } + }; + // compacted TLASes need to be substituted in cache and Descriptor Sets core::unordered_map> compactedTLASMap; // Anything to do? @@ -3776,40 +3868,6 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul return retval; } - // - auto findInStaging = [&reservations](const typename asset_traits::video_t* gpuObj)->core::blake3_hash_t* - { - auto& stagingCache = std::get>(reservations.m_stagingCaches); - const auto found = stagingCache.find(const_cast::video_t*>(gpuObj)); - assert(found!=stagingCache.end()); - return const_cast(&found->second.value); - }; - // wipe gpu item in staging cache (this may drop it as well if it was made for only a root asset == no users) - core::unordered_map outputReverseMap; - core::for_each_in_tuple(reservations.m_gpuObjects,[&outputReverseMap](const auto& gpuObjects)->void - { - uint32_t i = 0; - for (const auto& gpuObj : gpuObjects) - outputReverseMap[gpuObj.value.get()] = i++; - } - ); - auto markFailureInStaging = [&reservations,&outputReverseMap,logger](const char* message, smart_refctd_ptr& canonical, const typename asset_traits::video_t* gpuObj, core::blake3_hash_t* hash)->void - { - // wipe the smart pointer to the canonical, make sure we release that memory ASAP if no other user is around - canonical = nullptr; - logger.log("%s failed for \"%s\"",system::ILogger::ELL_ERROR,message,gpuObj->getObjectDebugName()); - // change the content hash on the reverse map to a NoContentHash - *hash = CHashCache::NoContentHash; - // also drop the smart pointer from the output array so failures release memory quickly - const auto foundIx = outputReverseMap.find(gpuObj); - if (foundIx!=outputReverseMap.end()) - { - auto& resultOutput = std::get>(reservations.m_gpuObjects); - 
resultOutput[foundIx->second].value = nullptr; - outputReverseMap.erase(foundIx); - } - }; - // core::bitflag submitsNeeded = IQueue::FAMILY_FLAGS::NONE; @@ -5191,16 +5249,13 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul for (auto& inWrite : tlasWriteMap) { // I know what I'm doing, this member has no influence on the set key hash - auto& tlas = const_cast&>(inWrite.tlas); + auto tlas = core::smart_refctd_ptr(const_cast(inWrite.tlas)); assert(tlas); if (missingDependent.template operator()(tlas.get())) - { - tlas = nullptr; continue; - } if (const auto foundCompacted=compactedTLASMap.find(tlas.get()); foundCompacted!=compactedTLASMap.end()) tlas = foundCompacted->second; - pInfo->desc = tlas; + pInfo->desc = std::move(tlas); writes.push_back({ .dstSet = inWrite.dstSet, .binding = inWrite.binding, @@ -5214,7 +5269,11 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // if the descriptor write fails, we make the Descriptor Sets behave as-if the TLAS build failed (dep is missing) if (!writes.empty() && !device->updateDescriptorSets(writes,{})) for (auto& inWrite : tlasWriteMap) - const_cast&>(inWrite.tlas) = nullptr; + { + auto* pHash = findInStaging.template operator()(inWrite.dstSet); + smart_refctd_ptr dummy; + markFailureInStaging("writing TLAS to Descriptor Set binding",dummy,inWrite.dstSet,pHash); + } } mergeCache.template operator()(); // needed for the IGPUDescriptorSets to check if TLAS exists/was written, can be released now From b044144e5c16c8e5e87ce4e5c5d3392d62f76f10 Mon Sep 17 00:00:00 2001 From: devsh Date: Sat, 10 May 2025 07:13:19 +0200 Subject: [PATCH 058/346] the deferred TLAS descriptor writes need to refcount the TLASes --- include/nbl/video/utilities/CAssetConverter.h | 4 ++-- src/nbl/video/utilities/CAssetConverter.cpp | 19 +++++++++---------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index 
9175f20a86..829735327c 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -1136,13 +1136,13 @@ class CAssetConverter : public core::IReferenceCounted { inline bool operator==(const SDeferredTLASWrite& other) const { - return dstSet == other.dstSet && binding == other.binding && arrayElement == other.arrayElement; + return dstSet==other.dstSet && binding==other.binding && arrayElement==other.arrayElement; } IGPUDescriptorSet* dstSet; uint32_t binding; uint32_t arrayElement; - const IGPUTopLevelAccelerationStructure* tlas; + core::smart_refctd_ptr tlas; }; struct SDeferredTLASWriteHasher { diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 32f9408365..e1816dbe1e 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -1960,7 +1960,7 @@ class GetDependantVisit : public GetDependantVisitBase) { - deferredTLASWrites.push_back({nullptr,binding.data,element,depObj.get()}); + deferredTLASWrites.push_back({nullptr,binding.data,element,depObj}); return true; } // @@ -3586,11 +3586,9 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult auto& stagingCache = std::get>(retval.m_stagingCaches); phmap::erase_if(stagingCache,[](const auto& entry)->bool { - if constexpr (std::is_same_v) - { - // TODO: gather into m_deferredTLASDescriptorWrites - } - return entry.first->getReferenceCount()==1; + if (entry.first->getReferenceCount()==1) + return true; + return false; } ); }; @@ -3608,19 +3606,20 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult pruneStaging.template operator()(); pruneStaging.template operator()(); pruneStaging.template operator()(); - // need to nerf any writes to descriptor sets which don't exist anymore before checking the refcounts on them + // because Descriptor Sets don't hold onto TLASes yet, we need to drop the TLASes in deferred descriptor 
writes phmap::erase_if(retval.m_deferredTLASDescriptorWrites,[&](const auto& entry)->bool { auto& dsStaging = std::get>(retval.m_stagingCaches); - return dsStaging.find(entry.dstSet)!=dsStaging.end(); + return dsStaging.find(entry.dstSet)==dsStaging.end(); } ); pruneStaging.template operator()(); +// go over pruneStaging.template operator()(); pruneStaging.template operator()(); } - // TODO: defer the conversion requests until final objects are known (or knock them out) -> maybe change the conversion requests to unordered_map ? + // TODO: prune the conversion requests -> maybe change the conversion requests to unordered_map ? // TODO: only now get the queue flags @@ -5249,7 +5248,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul for (auto& inWrite : tlasWriteMap) { // I know what I'm doing, this member has no influence on the set key hash - auto tlas = core::smart_refctd_ptr(const_cast(inWrite.tlas)); + auto tlas = core::smart_refctd_ptr(const_cast(inWrite.tlas.get())); assert(tlas); if (missingDependent.template operator()(tlas.get())) continue; From 8555fad476c7ee91e8bfb37ab23d05b3ce2de83e Mon Sep 17 00:00:00 2001 From: devsh Date: Sat, 10 May 2025 14:44:42 +0200 Subject: [PATCH 059/346] turned conversion requests into `unordered_map`s so they're easier to knock out --- include/nbl/video/utilities/CAssetConverter.h | 36 +-- src/nbl/video/utilities/CAssetConverter.cpp | 244 ++++++++++-------- 2 files changed, 156 insertions(+), 124 deletions(-) diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index 829735327c..12326acc6c 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -1085,36 +1085,29 @@ class CAssetConverter : public core::IReferenceCounted core::tuple_transform_t m_stagingCaches; // need a more explicit list of GPU objects that need device-assisted conversion - template - struct SConversionRequestBase - { - // canonical 
asset (the one that provides content) - core::smart_refctd_ptr canonical; - // gpu object to transfer canonical's data to or build it from - asset_traits::video_t* gpuObj; - }; - using SConvReqBuffer = SConversionRequestBase; - core::vector m_bufferConversions; - struct SConvReqImage : SConversionRequestBase + core::unordered_map> m_bufferConversions; + struct SConvReqImage { + core::smart_refctd_ptr canonical = nullptr; uint16_t recomputeMips = 0; }; - core::vector m_imageConversions; + core::unordered_map m_imageConversions; template - struct SConvReqAccelerationStructure : SConversionRequestBase + struct SConvReqAccelerationStructure { using build_f = typename asset_traits::video_t::BUILD_FLAGS; inline void setBuildFlags(const build_f _flags) {buildFlags = static_cast(_flags);} inline build_f getBuildFlags() const {return static_cast(buildFlags);} + core::smart_refctd_ptr canonical = nullptr; uint64_t scratchSize : 45; uint64_t compact : 1; uint64_t buildFlags : 16 = 0; }; using SConvReqBLAS = SConvReqAccelerationStructure; - core::vector m_blasConversions[2]; + core::unordered_map m_blasConversions[2]; using SConvReqTLAS = SConvReqAccelerationStructure; - core::vector m_tlasConversions[2]; + core::unordered_map m_tlasConversions[2]; // array index 0 for device builds, 1 for host builds uint64_t m_minASBuildScratchSize[2] = {0,0}; @@ -1136,25 +1129,22 @@ class CAssetConverter : public core::IReferenceCounted { inline bool operator==(const SDeferredTLASWrite& other) const { - return dstSet==other.dstSet && binding==other.binding && arrayElement==other.arrayElement; + return binding==other.binding && arrayElement==other.arrayElement; } - IGPUDescriptorSet* dstSet; uint32_t binding; uint32_t arrayElement; - core::smart_refctd_ptr tlas; + core::smart_refctd_ptr tlas; }; struct SDeferredTLASWriteHasher { inline size_t operator()(const SDeferredTLASWrite& write) const { - size_t retval = std::bit_cast(write.dstSet); - core::hash_combine(retval,write.binding); - 
core::hash_combine(retval,write.arrayElement); - return retval; + return std::hash()((uint64_t(write.binding)<<32)|write.arrayElement); } }; - core::unordered_set m_deferredTLASDescriptorWrites; + using deferred_tlas_write_set_t = core::unordered_set; + core::unordered_map m_deferredTLASDescriptorWrites; // core::bitflag m_queueFlags = IQueue::FAMILY_FLAGS::NONE; diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index e1816dbe1e..b90be0b323 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -1881,8 +1881,6 @@ class GetDependantVisit : public GetDependantVisitBase : public GetDependantVisitBase writes = {}; core::vector infos = {}; - core::vector deferredTLASWrites; + CAssetConverter::SReserveResult::deferred_tlas_write_set_t deferredTLASWrites; // has to be public because of aggregate init, but its only for internal usage! uint32_t lastBinding; uint32_t lastElement; @@ -1960,7 +1958,8 @@ class GetDependantVisit : public GetDependantVisitBase) { - deferredTLASWrites.push_back({nullptr,binding.data,element,depObj}); + const auto [where,inserted] =deferredTLASWrites.insert({binding.data,element,depObj}); + assert(inserted); return true; } // @@ -3377,7 +3376,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult ds = nullptr; } else - retval.m_deferredTLASDescriptorWrites.insert(visitor.deferredTLASWrites.begin(),visitor.deferredTLASWrites.end()); + retval.m_deferredTLASDescriptorWrites[ds.get()] = std::move(visitor.deferredTLASWrites); } else inputs.logger.log("Failed to create Descriptor Pool suited for Layout %s",system::ILogger::ELL_ERROR,layout->getObjectDebugName()); @@ -3415,11 +3414,8 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto i=0ull; igetBoundMemory(); - assert(boundMemory.isValid()); - if (!canHostWriteToMemoryRange(boundMemory,gpuBuff->getSize())) - retval.m_queueFlags |= 
IQueue::FAMILY_FLAGS::TRANSFER_BIT; - retval.m_bufferConversions.push_back({core::smart_refctd_ptr(entry.second.canonicalAsset),gpuBuff.get()}); + auto [where,inserted] = retval.m_bufferConversions.insert({gpuBuff.get(),core::smart_refctd_ptr(entry.second.canonicalAsset)}); + assert(inserted); } bufferConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); // Deal with Deferred Creation of Acceleration structures @@ -3431,7 +3427,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // TLAS and BLAS can't build concurrently, index 0 is device build, 1 is host build size_t scratchSizeFullParallelBuild[2] = {0,0}; // - core::vector>* pConversions; + core::unordered_map::video_t*,SReserveResult::SConvReqAccelerationStructure>* pConversions; if constexpr (IsTLAS) pConversions = retval.m_tlasConversions; else @@ -3482,9 +3478,8 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult continue; } // file the request for conversion - auto& request = pConversions[patch.hostBuild].emplace_back(); + auto& request = pConversions[patch.hostBuild][as.get()]; request.canonical = smart_refctd_ptr(canonical); - request.gpuObj = as.get(); request.scratchSize = deferredParams.scratchSize; request.compact = patch.compactAfterBuild; request.buildFlags = static_cast(patch.getBuildFlags(canonical).value); @@ -3510,20 +3505,14 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult auto& dfsCacheImages = std::get>(dfsCaches); for (auto& entry : imageConversions.contentHashToCanonical) for (auto i=0ull; igetRegions().empty()) { - const auto boundMemory = gpuImg->getBoundMemory(); - assert(boundMemory.isValid()); - retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; - const bool recomputeMips = dfsCacheImages.nodes[entry.second.patchIndex.value].patch.recomputeMips; - if (recomputeMips) - retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; - // Best effort guess, without actually looking at 
all regions - const auto& params = gpuImg->getCreationParameters(); - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdCopyBufferToImage.html#VUID-vkCmdCopyBufferToImage-commandBuffer-07739 - if (isDepthOrStencilFormat(params.format) && (params.depthUsage|params.stencilUsage).hasFlags(IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT)) - retval.m_queueFlags |= IQueue::FAMILY_FLAGS::GRAPHICS_BIT; - retval.m_imageConversions.push_back({{core::smart_refctd_ptr(entry.second.canonicalAsset),gpuImg.get()},recomputeMips}); + const auto* cpuImg = entry.second.canonicalAsset; + if (auto& gpuImg=imageConversions.gpuObjects[i+entry.second.firstCopyIx].value; gpuImg && !cpuImg->getRegions().empty()) + { + const bool recomputeMips = dfsCacheImages.nodes[entry.second.patchIndex.value].patch.recomputeMips; + auto [where,inserted] = retval.m_imageConversions.insert({gpuImg.get(),SReserveResult::SConvReqImage{core::smart_refctd_ptr(cpuImg),recomputeMips}}); + assert(inserted); + } } imageConversions.propagateToCaches(dfsCacheImages,std::get>(retval.m_stagingCaches)); } @@ -3584,10 +3573,25 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult auto pruneStaging = [&]()->void { auto& stagingCache = std::get>(retval.m_stagingCaches); - phmap::erase_if(stagingCache,[](const auto& entry)->bool + phmap::erase_if(stagingCache,[&retval](const auto& entry)->bool { if (entry.first->getReferenceCount()==1) + { + if constexpr (std::is_same_v) + retval.m_bufferConversions.erase(entry.first); + if constexpr (std::is_same_v) + { + } + if constexpr (std::is_same_v) + { + } + if constexpr (std::is_same_v) + retval.m_imageConversions.erase(entry.first); + // because Descriptor Sets don't hold onto TLASes yet, we need to drop the TLASes in deferred descriptor writes + if constexpr (std::is_same_v) + retval.m_deferredTLASDescriptorWrites.erase(entry.first); return true; + } return false; } ); @@ -3606,13 +3610,6 @@ auto CAssetConverter::reserve(const SInputs& 
inputs) -> SReserveResult pruneStaging.template operator()(); pruneStaging.template operator()(); pruneStaging.template operator()(); - // because Descriptor Sets don't hold onto TLASes yet, we need to drop the TLASes in deferred descriptor writes - phmap::erase_if(retval.m_deferredTLASDescriptorWrites,[&](const auto& entry)->bool - { - auto& dsStaging = std::get>(retval.m_stagingCaches); - return dsStaging.find(entry.dstSet)==dsStaging.end(); - } - ); pruneStaging.template operator()(); // go over pruneStaging.template operator()(); @@ -3621,7 +3618,35 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // TODO: prune the conversion requests -> maybe change the conversion requests to unordered_map ? - // TODO: only now get the queue flags + // only now get the queue flags + { + using q_fam_f = IQueue::FAMILY_FLAGS; + // images are trickier, we can't finish iterating until all possible flags are there + for (auto it=retval.m_imageConversions.begin(); !retval.m_queueFlags.hasFlags(q_fam_f::TRANSFER_BIT|q_fam_f::COMPUTE_BIT|q_fam_f::GRAPHICS_BIT) && it!=retval.m_imageConversions.end(); it++) + { + const auto boundMemory = it->first->getBoundMemory(); + assert(boundMemory.isValid()); + // Note: with `host_image_copy` this will get conditional + { + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; + // Best effort guess, without actually looking at all regions + const auto& params = it->first->getCreationParameters(); + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdCopyBufferToImage.html#VUID-vkCmdCopyBufferToImage-commandBuffer-07739 + if (isDepthOrStencilFormat(params.format) && (params.depthUsage | params.stencilUsage).hasFlags(IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT)) + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::GRAPHICS_BIT; + if (it->second.recomputeMips) + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; + } + } + // buffer conversions + for (auto it=retval.m_bufferConversions.begin(); 
!retval.m_queueFlags.hasFlags(q_fam_f::TRANSFER_BIT) && it!=retval.m_bufferConversions.end(); it++) + { + const auto boundMemory = it->first->getBoundMemory(); + assert(boundMemory.isValid()); + if (!canHostWriteToMemoryRange(boundMemory,it->first->getSize())) + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; + } + } retval.m_converter = core::smart_refctd_ptr(this); retval.m_logger = system::logger_opt_smart_ptr(core::smart_refctd_ptr(inputs.logger.get())); @@ -3648,15 +3673,16 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul { for (; hostBufferXferIt!=reservations.m_bufferConversions.end() && pred(); hostBufferXferIt++) { - const size_t size = hostBufferXferIt->gpuObj->getSize(); - const auto boundMemory = hostBufferXferIt->gpuObj->getBoundMemory(); + IGPUBuffer* buff = hostBufferXferIt->first; + const size_t size = buff->getSize(); + const auto boundMemory = buff->getBoundMemory(); if (!canHostWriteToMemoryRange(boundMemory,size)) continue; auto* const memory = boundMemory.memory; const IDeviceMemoryAllocation::MemoryRange range = {boundMemory.offset,size}; - memcpy(reinterpret_cast(memory->getMappedPointer())+range.offset,hostBufferXferIt->canonical->getPointer(),size); + memcpy(reinterpret_cast(memory->getMappedPointer())+range.offset,hostBufferXferIt->second->getPointer(),size); // let go of canonical asset (may free RAM) - hostBufferXferIt->canonical = nullptr; + hostBufferXferIt->second = nullptr; if (memory->haveToMakeVisible()) memoryHostFlushRanges.emplace_back(memory,range.offset,range.length,ILogicalDevice::MappedMemoryRange::align_non_coherent_tag); } @@ -3932,8 +3958,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } for (auto& item : buffersToUpload) { - auto* buffer = item.gpuObj; - const size_t size = item.gpuObj->getCreationParams().size; + auto* buffer = item.first; + const size_t size = buffer->getCreationParams().size; // host will upload if 
(canHostWriteToMemoryRange(buffer->getBoundMemory(),size)) continue; @@ -3942,21 +3968,21 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul const auto ownerQueueFamily = checkOwnership(buffer,params.getFinalOwnerQueueFamily(buffer,*pFoundHash),transferFamily); if (ownerQueueFamily==QueueFamilyInvalid) { - markFailureInStaging("invalid Final Queue Family given by user callback",item.canonical,buffer,pFoundHash); + markFailureInStaging("invalid Final Queue Family given by user callback",item.second,buffer,pFoundHash); continue; } // do the upload const SBufferRange range = {.offset=0,.size=size,.buffer=core::smart_refctd_ptr(buffer)}; - const bool success = params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,item.canonical->getPointer()); + const bool success = params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,item.second->getPointer()); // current recording buffer may have changed xferCmdBuf = params.transfer->getCommandBufferForRecording(); if (!success) { - markFailureInStaging("Data Upload",item.canonical,buffer,pFoundHash); + markFailureInStaging("Data Upload",item.second,buffer,pFoundHash); continue; } // let go of canonical asset (may free RAM) - item.canonical = nullptr; + item.second = nullptr; submitsNeeded |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; // enqueue ownership release if necessary if (ownerQueueFamily!=IQueue::FamilyIgnored) @@ -4116,8 +4142,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul for (auto& item : imagesToUpload) { // basiscs - const auto* cpuImg = item.canonical.get(); - auto* image = item.gpuObj; + auto& cpuImg = item.second.canonical; + auto* image = item.first; auto pFoundHash = findInStaging.template operator()(image); // get params const auto& creationParams = image->getCreationParameters(); @@ -4136,7 +4162,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul }); IGPUImageView::E_TYPE viewType = IGPUImageView::E_TYPE::ET_2D_ARRAY; 
// create Mipmapping source Image View, allocate its place in the descriptor set and write it - if (item.recomputeMips) + if (item.second.recomputeMips) { switch (creationParams.type) { @@ -4168,7 +4194,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } if (!quickWriteDescriptor(SrcMipBinding,srcIx,std::move(srcView))) { - markFailureInStaging("Source Mip Level Descriptor Write",item.canonical,image,pFoundHash); + markFailureInStaging("Source Mip Level Descriptor Write",cpuImg,image,pFoundHash); continue; } } @@ -4177,7 +4203,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul { // Transfer and Compute barriers get recorded for image individually (see the TODO why its horrible) // so we only need to worry about QFOTs for current image if they even exist - if (item.recomputeMips && !transferBarriers.empty()) + if (item.second.recomputeMips && !transferBarriers.empty()) { // so now we need a immeidate QFOT Release cause we already recorded some compute mipmapping for current image if (pipelineBarrier(xferCmdBuf,{.memBarriers={},.bufBarriers={},.imgBarriers=transferBarriers},"Recording QFOT Release from Transfer Queue Family after overflow failed")) @@ -4189,7 +4215,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } else { - markFailureInStaging("Image QFOT Pipeline Barrier",item.canonical,image,pFoundHash); + markFailureInStaging("Image QFOT Pipeline Barrier",cpuImg,image,pFoundHash); return false; } return true; @@ -4205,6 +4231,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul computeBarriers.clear(); const bool concurrentSharing = image->getCachedCreationParams().isConcurrentSharing(); uint8_t lvl = 0; + const auto recomputeMipMask = item.second.recomputeMips; bool _prevRecompute = false; for (; lvl CAssetConverter::convert_impl(SReserveResul // if any op, it will always be a release (Except acquisition of first source mip in compute) barrier.ownershipOp = ownership_op_t::RELEASE; // 
if we're recomputing this mip level - const bool recomputeMip = lvl && (item.recomputeMips&(0x1u<<(lvl-1))); + const bool recomputeMip = lvl && (recomputeMipMask&(0x1u<<(lvl-1))); // query final layout from callback const auto finalLayout = params.getFinalLayout(image,*pFoundHash,lvl); // get region data for upload @@ -4434,7 +4461,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT; barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; // whether next mip will need to read from this one to recompute itself - const bool sourceForNextMipCompute = item.recomputeMips&(0x1u<general transition tmp.newLayout = sourceForNextMipCompute ? layout_t::GENERAL : layout_t::TRANSFER_DST_OPTIMAL; // fire off the pipeline barrier so we can start uploading right away @@ -4503,18 +4530,18 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // failed in the for-loop if (lvl != creationParams.mipLevels) { - markFailureInStaging("Compute Mip Mapping",item.canonical,image,pFoundHash); + markFailureInStaging("Compute Mip Mapping",cpuImg,image,pFoundHash); continue; } // let go of canonical asset (may free RAM) - item.canonical = nullptr; + cpuImg = nullptr; } // here we only record barriers that do final layout transitions and release ownership to final queue family if (!transferBarriers.empty()) { if (!pipelineBarrier(xferCmdBuf,{.memBarriers={},.bufBarriers={},.imgBarriers=transferBarriers},"Final Pipeline Barrier recording to Transfer Command Buffer failed")) { - markFailureInStaging("Image Data Upload Pipeline Barrier",item.canonical,image,pFoundHash); + markFailureInStaging("Image Data Upload Pipeline Barrier",cpuImg,image,pFoundHash); continue; } // even if no uploads performed, we do layout transitions on empty images from Xfer Queue @@ -4526,7 +4553,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul 
dsAlloc->multi_deallocate(SrcMipBinding,1,&srcIx,params.compute->getFutureScratchSemaphore()); if (!pipelineBarrier(computeCmdBuf,{.memBarriers={},.bufBarriers={},.imgBarriers=computeBarriers},"Final Pipeline Barrier recording to Compute Command Buffer failed")) { - markFailureInStaging("Compute Mip Mapping Pipeline Barrier",item.canonical,image,pFoundHash); + markFailureInStaging("Compute Mip Mapping Pipeline Barrier",cpuImg,image,pFoundHash); continue; } } @@ -4751,17 +4778,18 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul for (auto& tlasToBuild : tlasesToBuild) { dedupBLASesUsed.clear(); - const auto as = tlasToBuild.gpuObj; + auto& canonical = tlasToBuild.second.canonical; + const auto as = tlasToBuild.first; const auto pFoundHash = findInStaging.template operator()(as); const auto& backingRange = as->getCreationParams().bufferRange; // checking ownership for the future on old buffer, but compacted will be made with same sharing creation parameters const auto finalOwnerQueueFamily = checkOwnership(backingRange.buffer.get(),params.getFinalOwnerQueueFamily(as,*pFoundHash),computeFamily); if (finalOwnerQueueFamily==QueueFamilyInvalid) { - markFailureInStaging("invalid Final Queue Family given by user callback",tlasToBuild.canonical,as,pFoundHash); + markFailureInStaging("invalid Final Queue Family given by user callback",canonical,as,pFoundHash); continue; } - const auto instances = tlasToBuild.canonical->getInstances(); + const auto instances = canonical->getInstances(); const auto instanceCount = static_cast(instances.size()); size_t instanceDataSize = 0; // gather total input size and check dependants exist @@ -4779,13 +4807,13 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // problem with finding the dependents (BLASes) if (instanceDataSize==0) { - markFailureInStaging("finding valid Dependant GPU BLASes for TLAS build",tlasToBuild.canonical,as,pFoundHash); + markFailureInStaging("finding valid Dependant GPU BLASes for 
TLAS build",canonical,as,pFoundHash); continue; } // allocate scratch and build inputs constexpr uint32_t MaxAllocCount = 3; addr_t offsets[MaxAllocCount] = {scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value}; - const addr_t sizes[MaxAllocCount] = {tlasToBuild.scratchSize,instanceDataSize,sizeof(void*)*instanceCount}; + const addr_t sizes[MaxAllocCount] = {tlasToBuild.second.scratchSize,instanceDataSize,sizeof(void*)*instanceCount}; { const addr_t alignments[MaxAllocCount] = {limits.minAccelerationStructureScratchOffsetAlignment,16,alignof(uint64_t)}; const auto AllocCount = as->usesMotion() ? 2:3; @@ -4879,16 +4907,16 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul xferCmdBuf = params.transfer->getCommandBufferForRecording(); if (!success) { - markFailureInStaging("Uploading Instance Data for TLAS build failed",tlasToBuild.canonical,as,pFoundHash); + markFailureInStaging("Uploading Instance Data for TLAS build failed",canonical,as,pFoundHash); continue; } // let go of canonical asset (may free RAM) - tlasToBuild.canonical = nullptr; + canonical = nullptr; } // prepare build infos auto& buildInfo = buildInfos.emplace_back(); buildInfo.scratch = {.offset=offsets[0],.buffer=smart_refctd_ptr(scratchBuffer)}; - buildInfo.buildFlags = tlasToBuild.getBuildFlags(); + buildInfo.buildFlags = tlasToBuild.second.getBuildFlags(); buildInfo.instanceDataTypeEncodedInPointersLSB = as->usesMotion(); buildInfo.dstAS = as; // note we don't build directly from staging, because only very small inputs could come from there and they'd impede the transfer efficiency of the larger ones @@ -4905,7 +4933,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // no special extra byte offset into the instance buffer rangeInfos.emplace_back(instanceCount,0u); // - const bool willCompact = tlasToBuild.compact; + const bool willCompact = tlasToBuild.second.compact; if (willCompact) 
compactions.push_back(as); // enqueue ownership release if necessary @@ -5180,13 +5208,18 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul const redirect_t& redirect = layout->getDescriptorRedirect(IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE); const auto bindingRange = redirect.findBindingStorageIndex(redirect_t::storage_offset_t(i)); const auto firstElementOffset = redirect.getStorageOffset(bindingRange).data; - const auto foundWrite = reservations.m_deferredTLASDescriptorWrites.find({ - .dstSet = item.first, - .binding = redirect.getBinding(bindingRange).data, - .arrayElement = i-firstElementOffset - }); - // was scheduled to write some TLAS to this binding, but TLAS is now null - depsMissing = foundWrite!=reservations.m_deferredTLASDescriptorWrites.end() && !foundWrite->tlas; + auto foundSet = reservations.m_deferredTLASDescriptorWrites.find(item.first); + if (foundSet!=reservations.m_deferredTLASDescriptorWrites.end()) + { + const auto foundWrite = foundSet->second.find({ + .binding = redirect.getBinding(bindingRange).data, + .arrayElement = i-firstElementOffset + }); + // was scheduled to write some TLAS to this binding, but TLAS is now null + depsMissing = foundWrite!=foundSet->second.end() && !foundWrite->tlas; + } + else + depsMissing = true; break; } default: @@ -5239,40 +5272,49 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul mergeCache.template operator()(); mergeCache.template operator()(); // write the TLASes into Descriptor Set finally - if (auto& tlasWriteMap=reservations.m_deferredTLASDescriptorWrites; !tlasWriteMap.empty()) + if (auto& tlasWriteDSMap=reservations.m_deferredTLASDescriptorWrites; !tlasWriteDSMap.empty()) { core::vector writes; - writes.reserve(tlasWriteMap.size()); - core::vector infos(writes.size()); - auto* pInfo = infos.data(); - for (auto& inWrite : tlasWriteMap) + core::vector infos; + for (auto& tlasWriteMap : tlasWriteDSMap) { - // I know what I'm doing, this member has no influence on 
the set key hash - auto tlas = core::smart_refctd_ptr(const_cast(inWrite.tlas.get())); - assert(tlas); - if (missingDependent.template operator()(tlas.get())) - continue; - if (const auto foundCompacted=compactedTLASMap.find(tlas.get()); foundCompacted!=compactedTLASMap.end()) - tlas = foundCompacted->second; - pInfo->desc = std::move(tlas); - writes.push_back({ - .dstSet = inWrite.dstSet, - .binding = inWrite.binding, - .arrayElement = inWrite.arrayElement, - .count = 1, - .info = pInfo++ - }); + writes.clear(); + infos.clear(); + auto* dstSet = tlasWriteMap.first; + for (auto& inWrite : tlasWriteMap.second) + { + // I know what I'm doing, this member has no influence on the set key hash or equal comparison operator + auto& tlas = const_cast&>(inWrite.tlas); + assert(tlas); + if (missingDependent.template operator()(tlas.get())) + { + tlas = {}; + continue; + } + if (const auto foundCompacted=compactedTLASMap.find(tlas.get()); foundCompacted!=compactedTLASMap.end()) + tlas = foundCompacted->second; + infos.emplace_back().desc = std::move(tlas); + writes.push_back({ + .dstSet = dstSet, + .binding = inWrite.binding, + .arrayElement = inWrite.arrayElement, + .count = 1 + }); + } + // + auto* pInfo = infos.data(); + for (auto& outWrite : writes) + outWrite.info = pInfo++; + // if the descriptor write fails, we make the Descriptor Sets behave as-if the TLAS build failed (dep is missing) + if (!writes.empty() && !device->updateDescriptorSets(writes,{})) + { + auto* pHash = findInStaging.template operator()(dstSet); + smart_refctd_ptr dummy; + markFailureInStaging("writing TLAS to Descriptor Set binding",dummy,dstSet,pHash); + } } // not strictly necessary, just provoking refcounting bugs right away if they exist compactedTLASMap.clear(); - // if the descriptor write fails, we make the Descriptor Sets behave as-if the TLAS build failed (dep is missing) - if (!writes.empty() && !device->updateDescriptorSets(writes,{})) - for (auto& inWrite : tlasWriteMap) - { - auto* 
pHash = findInStaging.template operator()(inWrite.dstSet); - smart_refctd_ptr dummy; - markFailureInStaging("writing TLAS to Descriptor Set binding",dummy,inWrite.dstSet,pHash); - } } mergeCache.template operator()(); // needed for the IGPUDescriptorSets to check if TLAS exists/was written, can be released now From 69df18a8d115d9a201eff57afc90ce3e5e75f5f2 Mon Sep 17 00:00:00 2001 From: devsh Date: Sun, 11 May 2025 00:11:21 +0200 Subject: [PATCH 060/346] save progress before attempting to remove `m_deferredTLASDescriptorWrites` --- include/nbl/video/utilities/CAssetConverter.h | 10 +- src/nbl/video/utilities/CAssetConverter.cpp | 117 +++++++++++------- 2 files changed, 81 insertions(+), 46 deletions(-) diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index 12326acc6c..e309a24fc3 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -1103,11 +1103,13 @@ class CAssetConverter : public core::IReferenceCounted uint64_t scratchSize : 45; uint64_t compact : 1; uint64_t buildFlags : 16 = 0; + // scratch + input size also accounting for worst case padding due to alignment + uint64_t buildSize; }; - using SConvReqBLAS = SConvReqAccelerationStructure; - core::unordered_map m_blasConversions[2]; - using SConvReqTLAS = SConvReqAccelerationStructure; - core::unordered_map m_tlasConversions[2]; + template + using SConvReqAccelerationStructureMap = core::unordered_map::video_t*,SConvReqAccelerationStructure>; + SConvReqAccelerationStructureMap m_blasConversions[2]; + SConvReqAccelerationStructureMap m_tlasConversions[2]; // array index 0 for device builds, 1 for host builds uint64_t m_minASBuildScratchSize[2] = {0,0}; diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index b90be0b323..4fadb1ee7f 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -1654,10 
+1654,6 @@ class GetDependantVisit; template<> class GetDependantVisit : public GetDependantVisitBase { - public: - // TODO: deal with usages not going through because of cancelled TLAS builds, by gathering in a top-down pass at the end of `reserve` - CAssetConverter::SReserveResult::cpu_to_gpu_blas_map_t* blasBuildMap = nullptr; - protected: bool descend_impl( const instance_t& user, const CAssetConverter::patch_t& userPatch, @@ -1668,16 +1664,6 @@ class GetDependantVisit : public GetDependant auto depObj = getDependant(dep,soloPatch); if (!depObj) return false; - if (blasBuildMap) - { - const auto instances = user.asset->getInstances(); - assert(instanceIndexfind(dep.asset); - if (foundBLAS!=blasBuildMap->end()) - foundBLAS->second.remainingUsages++; - else - blasBuildMap->insert(foundBLAS,{dep.asset,{depObj}}); - } return true; } }; @@ -1958,9 +1944,13 @@ class GetDependantVisit : public GetDependantVisitBase) { - const auto [where,inserted] =deferredTLASWrites.insert({binding.data,element,depObj}); - assert(inserted); - return true; + // not built yet? 
+ if (depObj->) + { + const auto [where,inserted] = deferredTLASWrites.insert({binding.data,element,depObj}); + assert(inserted); + return true; + } } // auto& outInfo = infos.emplace_back(); @@ -3420,19 +3410,16 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult bufferConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); // Deal with Deferred Creation of Acceleration structures { - const auto minScratchAlignment = device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment; auto createAccelerationStructures = [&]()->void { constexpr bool IsTLAS = std::is_same_v; - // TLAS and BLAS can't build concurrently, index 0 is device build, 1 is host build - size_t scratchSizeFullParallelBuild[2] = {0,0}; // - core::unordered_map::video_t*,SReserveResult::SConvReqAccelerationStructure>* pConversions; + SReserveResult::SConvReqAccelerationStructureMap* pConversions; if constexpr (IsTLAS) pConversions = retval.m_tlasConversions; else pConversions = retval.m_blasConversions; - // we collect that stats AFTER making sure that the BLAS / TLAS can actually be created + // we enqueue the conversions AFTER making sure that the BLAS / TLAS can actually be created for (size_t i=0; i SReserveResult { // check if the BLASes we want to use for the instances were successfully allocated and created AssetVisitor> visitor = { - {inputs,dfsCaches,&retval.m_blasBuildMap}, + {inputs,dfsCaches}, {canonical,deferredParams.uniqueCopyGroupID}, patch }; @@ -3483,23 +3470,13 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult request.scratchSize = deferredParams.scratchSize; request.compact = patch.compactAfterBuild; request.buildFlags = static_cast(patch.getBuildFlags(canonical).value); - // sizes for building 1-by-1 vs parallel, note that BLAS and TLAS can't be built concurrently - retval.m_minASBuildScratchSize[patch.hostBuild] = 
core::max(retval.m_minASBuildScratchSize[patch.hostBuild],deferredParams.buildSize); - scratchSizeFullParallelBuild[patch.hostBuild] += deferredParams.buildSize; - // note that in order to compact an AS you need to allocate a buffer range whose size is known only after the build - if (patch.compactAfterBuild) - retval.m_compactedASMaxMemory += bufSz; + request.buildSize = deferredParams.buildSize; } - retval.m_maxASBuildScratchSize[0] = core::max(scratchSizeFullParallelBuild[0],retval.m_maxASBuildScratchSize[0]); - retval.m_maxASBuildScratchSize[1] = core::max(scratchSizeFullParallelBuild[1],retval.m_maxASBuildScratchSize[1]); }; createAccelerationStructures.template operator()(); blasConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); createAccelerationStructures.template operator()(); tlasConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); - // - if (retval.willDeviceASBuild() || retval.willCompactAS()) - retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; } // find out which images need what caps for the transfer and mipmapping auto& dfsCacheImages = std::get>(dfsCaches); @@ -3580,11 +3557,11 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult if constexpr (std::is_same_v) retval.m_bufferConversions.erase(entry.first); if constexpr (std::is_same_v) - { - } + for (auto i=0; i<2; i++) + retval.m_blasConversions[i].erase(entry.first); if constexpr (std::is_same_v) - { - } + for (auto i=0; i<2; i++) + retval.m_tlasConversions[i].erase(entry.first); if constexpr (std::is_same_v) retval.m_imageConversions.erase(entry.first); // because Descriptor Sets don't hold onto TLASes yet, we need to drop the TLASes in deferred descriptor writes @@ -3592,6 +3569,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult retval.m_deferredTLASDescriptorWrites.erase(entry.first); return true; } + // still referenced, keep it around return false; } ); @@ -3611,16 
+3589,71 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult pruneStaging.template operator()(); pruneStaging.template operator()(); pruneStaging.template operator()(); -// go over + // go over future TLAS builds to gather used BLASes + for (auto i=0; i<2; i++) + for (const auto& req : retval.m_tlasConversions[i]) + { + auto* const cpuTLAS = req.second.canonical.get(); + assert(cpuTLAS); + for (const auto& instance : cpuTLAS->getInstances()) + { + auto* const cpuBLAS = instance.getBase().blas.get(); + auto foundBLAS = retval.m_blasBuildMap.find(cpuBLAS); + if (foundBLAS!=retval.m_blasBuildMap.end()) + foundBLAS->second.remainingUsages++; + else + { + smart_refctd_ptr gpuBLAS; +// TODO + retval.m_blasBuildMap.insert(foundBLAS,{cpuBLAS,{std::move(gpuBLAS),1,1}}); + } + } + } pruneStaging.template operator()(); pruneStaging.template operator()(); } - // TODO: prune the conversion requests -> maybe change the conversion requests to unordered_map ? - // only now get the queue flags { using q_fam_f = IQueue::FAMILY_FLAGS; + // acceleration structures, get scratch size + auto computeAccelerationStructureScratchSizes = [device,&retval]()->void + { + constexpr bool IsTLAS = std::is_same_v; + const auto& limits = device->getPhysicalDevice()->getLimits(); + const auto minScratchAlignment = limits.minAccelerationStructureScratchOffsetAlignment; + // index 0 is device build, 1 is host build + size_t scratchSizeFullParallelBuild[2] = {0,0}; + // + const SReserveResult::SConvReqAccelerationStructureMap* pConversions; + if constexpr (IsTLAS) + pConversions = retval.m_tlasConversions; + else + pConversions = retval.m_blasConversions; + // we collect the stats AFTER making sure only needed TLAS and BLAS will be built + for (auto i=0; i<2; i++) + for (auto req : pConversions[i]) + { + const auto buildSize = req.second.buildSize; + // sizes for building 1-by-1 vs parallel, note that BLAS and TLAS can't be built concurrently + retval.m_minASBuildScratchSize[i] = 
core::max(retval.m_minASBuildScratchSize[i],buildSize); + scratchSizeFullParallelBuild[i] = core::alignUp(scratchSizeFullParallelBuild[i],minScratchAlignment)+buildSize; + // note that in order to compact an AS you need to allocate a buffer range whose size is known only after the build + if (req.second.compact) + { + const auto asSize = req.first->getCreationParams().bufferRange.size; + assert(core::is_aligned_to(asSize,256)); + retval.m_compactedASMaxMemory += asSize; + } + } + // TLAS and BLAS can't build concurrently + retval.m_maxASBuildScratchSize[0] = core::max(scratchSizeFullParallelBuild[0],retval.m_maxASBuildScratchSize[0]); + retval.m_maxASBuildScratchSize[1] = core::max(scratchSizeFullParallelBuild[1],retval.m_maxASBuildScratchSize[1]); + }; + computeAccelerationStructureScratchSizes.template operator()(); + computeAccelerationStructureScratchSizes.template operator()(); + if (retval.willDeviceASBuild() || retval.willCompactAS()) + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; // images are trickier, we can't finish iterating until all possible flags are there for (auto it=retval.m_imageConversions.begin(); !retval.m_queueFlags.hasFlags(q_fam_f::TRANSFER_BIT|q_fam_f::COMPUTE_BIT|q_fam_f::GRAPHICS_BIT) && it!=retval.m_imageConversions.end(); it++) { @@ -3632,7 +3665,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // Best effort guess, without actually looking at all regions const auto& params = it->first->getCreationParameters(); // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdCopyBufferToImage.html#VUID-vkCmdCopyBufferToImage-commandBuffer-07739 - if (isDepthOrStencilFormat(params.format) && (params.depthUsage | params.stencilUsage).hasFlags(IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT)) + if (isDepthOrStencilFormat(params.format) && (params.depthUsage|params.stencilUsage).hasFlags(IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT)) retval.m_queueFlags |= 
IQueue::FAMILY_FLAGS::GRAPHICS_BIT; if (it->second.recomputeMips) retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; From 99e473d984e1be5f9bac070aac31ce4879c08fe2 Mon Sep 17 00:00:00 2001 From: devsh Date: Sun, 11 May 2025 01:57:40 +0200 Subject: [PATCH 061/346] Ok, so descriptor sets can actually track TLASes which are yet-unbuilt, makes life a lot easier. add `IGPUTopLevelAccelerationStructure::getPendingBuildVer()` to detect if TLAS built yet also make sure the maxInstanceCount gets hashed properly --- include/nbl/video/IGPUAccelerationStructure.h | 2 + include/nbl/video/utilities/CAssetConverter.h | 16 ++- src/nbl/video/utilities/CAssetConverter.cpp | 131 +++++++----------- 3 files changed, 59 insertions(+), 90 deletions(-) diff --git a/include/nbl/video/IGPUAccelerationStructure.h b/include/nbl/video/IGPUAccelerationStructure.h index c3a24080d0..60c6add5fb 100644 --- a/include/nbl/video/IGPUAccelerationStructure.h +++ b/include/nbl/video/IGPUAccelerationStructure.h @@ -667,6 +667,8 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr // using build_ver_t = uint32_t; + // + inline build_ver_t getPendingBuildVer() const {return m_pendingBuildVer;} // this gets called when execution is sure to happen 100%, e.g. 
not during command recording but during submission inline build_ver_t registerNextBuildVer() { diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index e309a24fc3..f7faa9598b 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -1131,22 +1131,24 @@ class CAssetConverter : public core::IReferenceCounted { inline bool operator==(const SDeferredTLASWrite& other) const { - return binding==other.binding && arrayElement==other.arrayElement; + return dstSet==other.dstSet && storageOffset.data==other.storageOffset.data; } - uint32_t binding; - uint32_t arrayElement; - core::smart_refctd_ptr tlas; + IGPUDescriptorSet* dstSet; + // binding and array element rolled up into one + IGPUDescriptorSetLayout::CBindingRedirect::storage_offset_t storageOffset; }; struct SDeferredTLASWriteHasher { inline size_t operator()(const SDeferredTLASWrite& write) const { - return std::hash()((uint64_t(write.binding)<<32)|write.arrayElement); + size_t retval = write.storageOffset.data; + core::hash_combine(retval,write.dstSet); + return retval; } }; - using deferred_tlas_write_set_t = core::unordered_set; - core::unordered_map m_deferredTLASDescriptorWrites; + using compacted_tlas_rewrite_set_t = core::unordered_set; + compacted_tlas_rewrite_set_t m_potentialTLASRewrites; // core::bitflag m_queueFlags = IQueue::FAMILY_FLAGS::NONE; diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 4fadb1ee7f..733be3f058 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -612,8 +612,9 @@ class AssetVisitor : public CRTP const IDescriptorSetLayoutBase::CBindingRedirect::storage_range_index_t storageRangeIx(j); const auto binding = redirect.getBinding(storageRangeIx); const uint32_t count = redirect.getCount(storageRangeIx); - // this is where the descriptors have their flattened 
place in a unified array - const auto* infos = allInfos.data()+redirect.getStorageOffset(storageRangeIx).data; + // this is where the descriptors have their flattened place in a unified array + const auto storageBaseOffset = redirect.getStorageOffset(storageRangeIx); + const auto* infos = allInfos.data()+storageBaseOffset.data; for (uint32_t el=0u; el(untypedDesc); - if (!descend(tlas,{tlas},type,binding,el)) + if (!descend(tlas,{tlas},type,binding,el,storageBaseOffset)) return false; break; } @@ -1164,6 +1165,7 @@ bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_thostBuild; hasher << lookup.patch->compactAfterBuild; + hasher << (lookup.patch->isMotion ? lookup.patch->maxInstances:0u); const auto instances = asset->getInstances(); hasher << instances.size(); AssetVisitor> visitor = { @@ -1883,7 +1885,7 @@ class GetDependantVisit : public GetDependantVisitBase writes = {}; core::vector infos = {}; - CAssetConverter::SReserveResult::deferred_tlas_write_set_t deferredTLASWrites; + core::vector potentialTLASRewrites = {}; // has to be public because of aggregate init, but its only for internal usage! uint32_t lastBinding; uint32_t lastElement; @@ -1941,17 +1943,6 @@ class GetDependantVisit : public GetDependantVisitBase) - { - // not built yet? 
- if (depObj->) - { - const auto [where,inserted] = deferredTLASWrites.insert({binding.data,element,depObj}); - assert(inserted); - return true; - } - } // auto& outInfo = infos.emplace_back(); outInfo.desc = std::move(depObj); @@ -1962,10 +1953,18 @@ class GetDependantVisit : public GetDependantVisitBase(argTuple); - outInfo.info.buffer.offset= std::get<0>(argTuple).offset; + outInfo.info.buffer.offset = std::get<0>(argTuple).offset; outInfo.info.buffer.size = std::get<0>(argTuple).size; } } + // mark potential TLAS rewrites (with compaction) so we don't have to scan entire descriptor set for potentially compacted TLASes + if constexpr (std::is_same_v) + if (depObj->getPendingBuildVer()==0) // means not built yet, so compactable by next `convert` run + { + auto storageOffset = std::get<0>(argTuple); + storageOffset.data += element; + potentialTLASRewrites.push_back(storageOffset); + } if constexpr (std::is_same_v) { outInfo.info.image.imageLayout = std::get<0>(argTuple); @@ -3366,7 +3365,8 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult ds = nullptr; } else - retval.m_deferredTLASDescriptorWrites[ds.get()] = std::move(visitor.deferredTLASWrites); + for (const auto storageIx : visitor.potentialTLASRewrites) + retval.m_potentialTLASRewrites.insert({ds.get(),storageIx}); } else inputs.logger.log("Failed to create Descriptor Pool suited for Layout %s",system::ILogger::ELL_ERROR,layout->getObjectDebugName()); @@ -3453,9 +3453,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult ); continue; } - // is there any reason for it to be more? 
- const uint32_t maxInstances = canonical->getInstances().size(); - as = device->createTopLevelAccelerationStructure({std::move(baseParams),maxInstances}); + as = device->createTopLevelAccelerationStructure({std::move(baseParams),patch.maxInstances}); } else as = device->createBottomLevelAccelerationStructure(std::move(baseParams)); @@ -3564,9 +3562,6 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult retval.m_tlasConversions[i].erase(entry.first); if constexpr (std::is_same_v) retval.m_imageConversions.erase(entry.first); - // because Descriptor Sets don't hold onto TLASes yet, we need to drop the TLASes in deferred descriptor writes - if constexpr (std::is_same_v) - retval.m_deferredTLASDescriptorWrites.erase(entry.first); return true; } // still referenced, keep it around @@ -3604,7 +3599,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult else { smart_refctd_ptr gpuBLAS; -// TODO +// TODO: figure out the BLAS that will be used, (this requires UUID) retval.m_blasBuildMap.insert(foundBLAS,{cpuBLAS,{std::move(gpuBLAS),1,1}}); } } @@ -5231,30 +5226,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); break; case asset::IDescriptor::EC_ACCELERATION_STRUCTURE: - { - const auto* tlas = static_cast(untypedDesc); - // successfully written a TLAS into the binding, nothing to check - if (tlas) - break; - // we have a null TLAS in the binding, and we have to check if we were supposed to have one in it - using redirect_t = IDescriptorSetLayoutBase::CBindingRedirect; - const redirect_t& redirect = layout->getDescriptorRedirect(IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE); - const auto bindingRange = redirect.findBindingStorageIndex(redirect_t::storage_offset_t(i)); - const auto firstElementOffset = redirect.getStorageOffset(bindingRange).data; - auto foundSet = reservations.m_deferredTLASDescriptorWrites.find(item.first); - if 
(foundSet!=reservations.m_deferredTLASDescriptorWrites.end()) - { - const auto foundWrite = foundSet->second.find({ - .binding = redirect.getBinding(bindingRange).data, - .arrayElement = i-firstElementOffset - }); - // was scheduled to write some TLAS to this binding, but TLAS is now null - depsMissing = foundWrite!=foundSet->second.end() && !foundWrite->tlas; - } - else - depsMissing = true; + depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); break; - } default: assert(false); depsMissing = true; @@ -5305,53 +5278,45 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul mergeCache.template operator()(); mergeCache.template operator()(); // write the TLASes into Descriptor Set finally - if (auto& tlasWriteDSMap=reservations.m_deferredTLASDescriptorWrites; !tlasWriteDSMap.empty()) + if (auto& tlasRewriteSet=reservations.m_potentialTLASRewrites; !tlasRewriteSet.empty()) { core::vector writes; - core::vector infos; - for (auto& tlasWriteMap : tlasWriteDSMap) + writes.reserve(tlasRewriteSet.size()); + core::vector infos(tlasRewriteSet.size()); + auto* pInfo = infos.data(); + for (auto& entry : tlasRewriteSet) { - writes.clear(); - infos.clear(); - auto* dstSet = tlasWriteMap.first; - for (auto& inWrite : tlasWriteMap.second) + auto* const dstSet = entry.dstSet; + // we need to check if the descriptor set itself didn't get deleted in the meantime + auto& stagingCache = std::get>(reservations.m_stagingCaches); + const auto found = stagingCache.find(dstSet); + if (found==stagingCache.end()) + continue; + // rewtrieve the binding from the TLAS + const auto* const tlas = static_cast(dstSet->getAllDescriptors(IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE)[entry.storageOffset.data].get()); + assert(tlas); + // only rewrite if successfully compacted + if (const auto foundCompacted=compactedTLASMap.find(tlas); foundCompacted!=compactedTLASMap.end()) { - // I know what I'm doing, this member has no influence on the set key hash or equal 
comparison operator - auto& tlas = const_cast&>(inWrite.tlas); - assert(tlas); - if (missingDependent.template operator()(tlas.get())) - { - tlas = {}; - continue; - } - if (const auto foundCompacted=compactedTLASMap.find(tlas.get()); foundCompacted!=compactedTLASMap.end()) - tlas = foundCompacted->second; - infos.emplace_back().desc = std::move(tlas); + pInfo->desc = foundCompacted->second; + using redirect_t = IDescriptorSetLayoutBase::CBindingRedirect; + const redirect_t& redirect = dstSet->getLayout()->getDescriptorRedirect(IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE); + const auto bindingRange = redirect.findBindingStorageIndex(entry.storageOffset); + const auto firstElementOffset = redirect.getStorageOffset(bindingRange); writes.push_back({ .dstSet = dstSet, - .binding = inWrite.binding, - .arrayElement = inWrite.arrayElement, - .count = 1 + .binding = redirect.getBinding(bindingRange).data, + .arrayElement = entry.storageOffset.data-firstElementOffset.data, + .count = 1, + .info = pInfo++ }); } - // - auto* pInfo = infos.data(); - for (auto& outWrite : writes) - outWrite.info = pInfo++; - // if the descriptor write fails, we make the Descriptor Sets behave as-if the TLAS build failed (dep is missing) - if (!writes.empty() && !device->updateDescriptorSets(writes,{})) - { - auto* pHash = findInStaging.template operator()(dstSet); - smart_refctd_ptr dummy; - markFailureInStaging("writing TLAS to Descriptor Set binding",dummy,dstSet,pHash); - } } - // not strictly necessary, just provoking refcounting bugs right away if they exist - compactedTLASMap.clear(); + // if the descriptor write fails, we make the Descriptor Sets behave as-if the TLAS build failed (dep is missing) + if (!writes.empty() && !device->updateDescriptorSets(writes,{})) + logger.log("Failed to write one of the compacted TLASes into a Descriptor Set, all Descriptor Sets will still use non-compacted TLASes",system::ILogger::ELL_ERROR); } mergeCache.template operator()(); - // needed for the 
IGPUDescriptorSets to check if TLAS exists/was written, can be released now - reservations.m_deferredTLASDescriptorWrites.clear(); // mergeCache.template operator()(); // no submit was necessary, so should signal the extra semaphores from the host From 8c549fb7105637649b6b71ae2bb481b8729daa4a Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 12 May 2025 17:40:48 +0700 Subject: [PATCH 062/346] Add computeDependants virtual function to IAsset --- include/nbl/asset/IAsset.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/nbl/asset/IAsset.h b/include/nbl/asset/IAsset.h index 3b8b123ce3..3802536029 100644 --- a/include/nbl/asset/IAsset.h +++ b/include/nbl/asset/IAsset.h @@ -169,6 +169,8 @@ class IAsset : virtual public core::IReferenceCounted return retval; } + virtual core::unordered_set computeDependants() const = 0; + virtual bool valid() const = 0; protected: From 01c4ac66ad760c843853eb1dfb9bc18fbf6a4bd0 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 12 May 2025 17:41:22 +0700 Subject: [PATCH 063/346] Implement computeDependants for ICPUComputePipeline --- include/nbl/asset/ICPUComputePipeline.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 5f933878b4..aa7656af86 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -37,6 +37,11 @@ class ICPUComputePipeline final : public ICPUPipeline computeDependants() const override + { + return {m_layout.get(), m_specInfo.shader.get()}; + } + inline virtual std::span getSpecInfo(hlsl::ShaderStage stage) const override final { if (stage==hlsl::ShaderStage::ESS_COMPUTE && isMutable()) From d9efa1a60e17995271a966ffdc20d93f4490fa53 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 12 May 2025 17:41:56 +0700 Subject: [PATCH 064/346] Implement compute pipeline base --- include/nbl/asset/ICPUComputePipeline.h | 9 ++-- include/nbl/asset/IComputePipeline.h | 56 
+++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 4 deletions(-) create mode 100644 include/nbl/asset/IComputePipeline.h diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index aa7656af86..01859e0c3f 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -6,15 +6,16 @@ #include "nbl/asset/ICPUPipeline.h" +#include "nbl/asset/IComputePipeline.h" namespace nbl::asset { //! CPU Version of Compute Pipeline -class ICPUComputePipeline final : public ICPUPipeline> +class ICPUComputePipeline final : public ICPUPipeline> { - using base_t = ICPUPipeline>; + using base_t = ICPUPipeline>; public: @@ -26,7 +27,7 @@ class ICPUComputePipeline final : public ICPUPipeline clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final { - auto newPipeline = new ICPUComputePipeline(std::move(layout)); + auto newPipeline = new ICPUComputePipeline(layout.get()); newPipeline->m_specInfo = m_specInfo.clone(depth); return core::smart_refctd_ptr(newPipeline, core::dont_grab); } @@ -73,7 +74,7 @@ class ICPUComputePipeline final : public ICPUPipeline(layout)) + base_t(layout, {}) {} }; diff --git a/include/nbl/asset/IComputePipeline.h b/include/nbl/asset/IComputePipeline.h new file mode 100644 index 0000000000..4f439d7100 --- /dev/null +++ b/include/nbl/asset/IComputePipeline.h @@ -0,0 +1,56 @@ +#ifndef _NBL_ASSET_I_COMPUTE_PIPELINE_H_INCLUDED_ +#define _NBL_ASSET_I_COMPUTE_PIPELINE_H_INCLUDED_ + +#include "nbl/asset/IPipeline.h" + +namespace nbl::asset +{ + +class IComputePipelineBase : public virtual core::IReferenceCounted +{ + public: + // Nabla requires device's reported subgroup size to be between 4 and 128 + enum class SUBGROUP_SIZE : uint8_t + { + // No constraint but probably means `gl_SubgroupSize` is Dynamically Uniform + UNKNOWN = 0, + // Allows the Subgroup Uniform `gl_SubgroupSize` to be non-Dynamically Uniform and vary between Device's min and max + VARYING 
= 1, + // The rest we encode as log2(x) of the required value + REQUIRE_4 = 2, + REQUIRE_8 = 3, + REQUIRE_16 = 4, + REQUIRE_32 = 5, + REQUIRE_64 = 6, + REQUIRE_128 = 7 + }; + + struct SCachedCreationParams final + { + SUBGROUP_SIZE requiredSubgroupSize : 3 = SUBGROUP_SIZE::UNKNOWN; //!< Default value of 8 means no requirement + uint8_t requireFullSubgroups : 1 = false; + }; +}; + +template +class IComputePipeline : public IPipeline, public IComputePipelineBase +{ + using base_creation_params_t = IPipeline; + + public: + + inline const SCachedCreationParams& getCachedCreationParams() const { return m_params; } + + protected: + explicit IComputePipeline(const PipelineLayoutType* layout, const SCachedCreationParams& cachedParams) : + IPipeline(core::smart_refctd_ptr(layout)), + m_params(cachedParams) + {} + + SCachedCreationParams m_params; + +}; + +} + +#endif From 4d5097b81eb79dd71ccf630696921a715babeaa8 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 12 May 2025 16:08:04 +0200 Subject: [PATCH 065/346] finish the Acceleration Structure `CAssetConverter::reserve` --- include/nbl/video/utilities/CAssetConverter.h | 32 ++-- src/nbl/video/utilities/CAssetConverter.cpp | 138 +++++++++--------- 2 files changed, 84 insertions(+), 86 deletions(-) diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index f7faa9598b..d9ace6226e 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -1100,32 +1100,30 @@ class CAssetConverter : public core::IReferenceCounted inline build_f getBuildFlags() const {return static_cast(buildFlags);} core::smart_refctd_ptr canonical = nullptr; - uint64_t scratchSize : 45; - uint64_t compact : 1; + uint64_t scratchSize : 47 = 0; uint64_t buildFlags : 16 = 0; + uint64_t compact : 1; // scratch + input size also accounting for worst case padding due to alignment uint64_t buildSize; }; - template - using SConvReqAccelerationStructureMap = 
core::unordered_map::video_t*,SConvReqAccelerationStructure>; - SConvReqAccelerationStructureMap m_blasConversions[2]; - SConvReqAccelerationStructureMap m_tlasConversions[2]; + using SConvReqBLASMap = core::unordered_map>; + SConvReqBLASMap m_blasConversions[2]; + struct SConvReqTLAS : SConvReqAccelerationStructure + { + // This tracks non-root BLASes which are needed for a subsequent TLAS build. + // Because the copy group ID of the BLAS can only depend on the copy group and pointer of the TLAS and BLAS, + // we can be sure that all instances of the same BLAS within a TLAS will have the same copy group ID and use a map instead of a vector for storage + // Note that even things which are NOT in the staging cache are tracked here to make sure they don't finish their lifetimes prematurely. + using cpu_to_gpu_blas_map_t = core::unordered_map>; + cpu_to_gpu_blas_map_t instanceMap; + }; + using SConvReqTLASMap = core::unordered_map; + SConvReqTLASMap m_tlasConversions[2]; // array index 0 for device builds, 1 for host builds uint64_t m_minASBuildScratchSize[2] = {0,0}; uint64_t m_maxASBuildScratchSize[2] = {0,0}; uint64_t m_compactedASMaxMemory = 0; - // This tracks non-root BLASes which are needed for a subsequent TLAS build. Note that even things which are NOT in the staging cache are tracked here to make sure they don't finish their lifetimes early. 
- struct BLASUsedInTLASBuild - { - // This is the BLAS meant to be used for the instance, note that compaction of a BLAS overwrites the initial values at the end of `reserve` - core::smart_refctd_ptr gpuBLAS; - uint64_t buildDuringConvertCall : 1 = false; - // internal micro-refcount which lets us know when we should remove the entry from the map below - uint64_t remainingUsages : 63 = 0; - }; - using cpu_to_gpu_blas_map_t = core::unordered_map; - cpu_to_gpu_blas_map_t m_blasBuildMap; // struct SDeferredTLASWrite { diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 733be3f058..7bfd361e94 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -445,7 +445,7 @@ class AssetVisitor : public CRTP } private: - // there is no `impl()` overload taking `ICPUTopLevelAccelerationStructure` same as there is no `ICPUmage` + // there is no `impl()` overload taking `ICPUBottomLevelAccelerationStructure` same as there is no `ICPUmage` inline bool impl(const instance_t& instance, const CAssetConverter::patch_t& userPatch) { const auto blasInstances = instance.asset->getInstances(); @@ -1656,6 +1656,9 @@ class GetDependantVisit; template<> class GetDependantVisit : public GetDependantVisitBase { + public: + CAssetConverter::SReserveResult::SConvReqTLAS::cpu_to_gpu_blas_map_t* instanceMap; + protected: bool descend_impl( const instance_t& user, const CAssetConverter::patch_t& userPatch, @@ -1666,6 +1669,7 @@ class GetDependantVisit : public GetDependant auto depObj = getDependant(dep,soloPatch); if (!depObj) return false; + instanceMap->operator[](dep.asset) = std::move(depObj); return true; } }; @@ -3397,9 +3401,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // now allocate the memory for buffers and images deferredAllocator.finalize(); - // TODO: everything below is slightly wrong due to not having a final top-down dependency checking pass 
throwing away useless non-root GPU subtrees - - // find out which buffers need to be uploaded via a staging buffer + // enqueue successfully created buffers for conversion for (auto& entry : bufferConversions.contentHashToCanonical) for (auto i=0ull; i SReserveResult { constexpr bool IsTLAS = std::is_same_v; // - SReserveResult::SConvReqAccelerationStructureMap* pConversions; + std::conditional_t* pConversions; if constexpr (IsTLAS) pConversions = retval.m_tlasConversions; else @@ -3437,11 +3439,12 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult }; } smart_refctd_ptr::video_t> as; + CAssetConverter::SReserveResult::SConvReqTLAS::cpu_to_gpu_blas_map_t blasInstanceMap; if constexpr (IsTLAS) { // check if the BLASes we want to use for the instances were successfully allocated and created AssetVisitor> visitor = { - {inputs,dfsCaches}, + {inputs,dfsCaches,&blasInstanceMap}, {canonical,deferredParams.uniqueCopyGroupID}, patch }; @@ -3469,6 +3472,8 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult request.compact = patch.compactAfterBuild; request.buildFlags = static_cast(patch.getBuildFlags(canonical).value); request.buildSize = deferredParams.buildSize; + if constexpr (IsTLAS) + request.instanceMap = std::move(blasInstanceMap); } }; createAccelerationStructures.template operator()(); @@ -3476,7 +3481,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult createAccelerationStructures.template operator()(); tlasConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); } - // find out which images need what caps for the transfer and mipmapping + // enqueue successfully created images with data to upload for conversion auto& dfsCacheImages = std::get>(dfsCaches); for (auto& entry : imageConversions.contentHashToCanonical) for (auto i=0ull; i SReserveResult pruneStaging.template operator()(); pruneStaging.template operator()(); pruneStaging.template operator()(); - // go over 
future TLAS builds to gather used BLASes - for (auto i=0; i<2; i++) - for (const auto& req : retval.m_tlasConversions[i]) - { - auto* const cpuTLAS = req.second.canonical.get(); - assert(cpuTLAS); - for (const auto& instance : cpuTLAS->getInstances()) - { - auto* const cpuBLAS = instance.getBase().blas.get(); - auto foundBLAS = retval.m_blasBuildMap.find(cpuBLAS); - if (foundBLAS!=retval.m_blasBuildMap.end()) - foundBLAS->second.remainingUsages++; - else - { - smart_refctd_ptr gpuBLAS; -// TODO: figure out the BLAS that will be used, (this requires UUID) - retval.m_blasBuildMap.insert(foundBLAS,{cpuBLAS,{std::move(gpuBLAS),1,1}}); - } - } - } pruneStaging.template operator()(); pruneStaging.template operator()(); } @@ -3620,7 +3605,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // index 0 is device build, 1 is host build size_t scratchSizeFullParallelBuild[2] = {0,0}; // - const SReserveResult::SConvReqAccelerationStructureMap* pConversions; + const std::conditional_t* pConversions; if constexpr (IsTLAS) pConversions = retval.m_tlasConversions; else @@ -3755,7 +3740,25 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } }; - // compacted TLASes need to be substituted in cache and Descriptor Sets + // want to check if deps successfully exist + struct SMissingDependent + { + // This only checks if whether we had to convert and failed, but the dependent might be in readCache of one or more converters, so if in doubt assume its okay + inline operator bool() const {return wasInStaging && gotWiped;} + + bool wasInStaging; + bool gotWiped; + }; + auto missingDependent = [&reservations](const typename asset_traits::video_t* dep)->SMissingDependent + { + auto& stagingCache = std::get>(reservations.m_stagingCaches); + auto found = stagingCache.find(const_cast::video_t*>(dep)); + SMissingDependent retval = {.wasInStaging=found!=stagingCache.end()}; + retval.gotWiped = retval.wasInStaging && 
found->second.value==CHashCache::NoContentHash; + return retval; + }; + + // Descriptor Sets need their TLAS descriptors substituted if they've been compacted core::unordered_map> compactedTLASMap; // Anything to do? auto reqQueueFlags = reservations.m_queueFlags; @@ -4672,6 +4675,9 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul .dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT }; + // compacted BLASes need to be substituted in cache and TLAS Build Inputs + using compacted_blas_map_t = core::unordered_map>; + compacted_blas_map_t compactedBLASMap; // Device BLAS builds if (blasCount) { @@ -4749,7 +4755,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul computeCmdBuf->cmdbuf->endDebugMarker(); { // the already compacted BLASes need to be written into the TLASes using them, want to swap them out ASAP -//reservations.m_blasBuildMap[canonical].gpuBLAS = compacted; +//compactedBLASMap[as] = compacted; } computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact BLASes END"); computeCmdBuf->cmdbuf->endDebugMarker(); @@ -4801,11 +4807,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul using scratch_allocator_t = std::remove_reference_t; using addr_t = typename scratch_allocator_t::size_type; const auto& limits = physDev->getLimits(); - core::unordered_set> dedupBLASesUsed; - dedupBLASesUsed.reserve(reservations.m_blasBuildMap.size()); for (auto& tlasToBuild : tlasesToBuild) { - dedupBLASesUsed.clear(); auto& canonical = tlasToBuild.second.canonical; const auto as = tlasToBuild.first; const auto pFoundHash = findInStaging.template operator()(as); @@ -4819,19 +4822,30 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } const auto instances = canonical->getInstances(); const auto instanceCount = static_cast(instances.size()); + const auto& instanceMap = tlasToBuild.second.instanceMap; size_t instanceDataSize = 0; // gather total input size and check dependants exist + bool 
dependsOnBLASBuilds = false; for (const auto& instance : instances) { - // failed BLAS builds erase themselves from this map, so this checks if some BLAS used but which had to be built failed the build - const auto found = reservations.m_blasBuildMap.find(instance.getBase().blas.get()); - if (found==reservations.m_blasBuildMap.end() || failedBLASBarrier && found->second.buildDuringConvertCall) + auto found = instanceMap.find(instance.getBase().blas.get()); + assert(instanceMap.end()!=found); + const auto depInfo = missingDependent.template operator()(found->second.get()); + if (depInfo) { instanceDataSize = 0; break; } + if (depInfo.wasInStaging) + dependsOnBLASBuilds; instanceDataSize += ITopLevelAccelerationStructure::getInstanceSize(instance.getType()); } + // problem with building some Dependent BLASes + if (failedBLASBarrier && dependsOnBLASBuilds) + { + markFailureInStaging("building BLASes which current TLAS build wants to instance",canonical,as,pFoundHash); + continue; + } // problem with finding the dependents (BLASes) if (instanceDataSize==0) { @@ -4862,6 +4876,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul params.scratchForDeviceASBuild->multi_deallocate(AllocCount,&offsets[0],&sizes[0],params.compute->getFutureScratchSemaphore()); } // stream the instance/geometry input in + const size_t trackedBLASesOffset = trackedBLASes.size(); { bool success = true; { @@ -4881,27 +4896,30 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul const auto newWritten = bytesWritten+size; if (newWritten>=blockSize) return bytesWritten; - auto found = blasBuildMap->find(instance.getBase().blas.get()); - assert(found!=blasBuildMap->end()); - const auto& blas = found->second.gpuBLAS; - dst = IGPUTopLevelAccelerationStructure::writeInstance(dst,instance,blas.get()->getReferenceForDeviceOperations()); - dedupBLASesUsed->emplace(blas); - if (--found->second.remainingUsages == 0) - blasBuildMap->erase(found); + auto found = 
instanceMap->find(instance.getBase().blas.get()); + auto blas = found->second.get(); + if (auto found=compactedBLASMap->find(blas); found!=compactedBLASMap->end()) + blas = found->second.get(); + trackedBLASes->emplace_back(blas); + dst = IGPUTopLevelAccelerationStructure::writeInstance(dst,instance,blas->getReferenceForDeviceOperations()); bytesWritten = newWritten; } } - SReserveResult::cpu_to_gpu_blas_map_t* blasBuildMap; - core::unordered_set>* dedupBLASesUsed; + const compacted_blas_map_t* compactedBLASMap; + core::vector>* trackedBLASes; + SReserveResult::SConvReqTLAS::cpu_to_gpu_blas_map_t* instanceMap; std::span instances; uint32_t instanceIndex = 0; }; FillInstances fillInstances; - fillInstances.blasBuildMap = &reservations.m_blasBuildMap; - fillInstances.dedupBLASesUsed = &dedupBLASesUsed; + fillInstances.compactedBLASMap = &compactedBLASMap; + fillInstances.trackedBLASes = &trackedBLASes; + fillInstances.instanceMap = &tlasToBuild.second.instanceMap; fillInstances.instances = instances; success = streamDataToScratch(offsets[1],sizes[1],fillInstances); + // provoke refcounting bugs right away + tlasToBuild.second.instanceMap.clear(); } if (success && as->usesMotion()) { @@ -4935,6 +4953,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul xferCmdBuf = params.transfer->getCommandBufferForRecording(); if (!success) { + trackedBLASes.resize(trackedBLASesOffset); markFailureInStaging("Uploading Instance Data for TLAS build failed",canonical,as,pFoundHash); continue; } @@ -4950,14 +4969,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // note we don't build directly from staging, because only very small inputs could come from there and they'd impede the transfer efficiency of the larger ones buildInfo.instanceData = {.offset=offsets[as->usesMotion() ? 
2:1],.buffer=smart_refctd_ptr(scratchBuffer)}; // be based cause vectors can grow - { - const auto offset = trackedBLASes.size(); - using p_p_BLAS_t = const IGPUBottomLevelAccelerationStructure**; - buildInfo.trackedBLASes = {reinterpret_cast(offset),dedupBLASesUsed.size()}; - for (auto& blas : dedupBLASesUsed) - trackedBLASes.emplace_back(std::move(blas)); - - } + using p_p_BLAS_t = const IGPUBottomLevelAccelerationStructure**; + buildInfo.trackedBLASes = {reinterpret_cast(trackedBLASesOffset),trackedBLASes.size()-trackedBLASesOffset}; // no special extra byte offset into the instance buffer rangeInfos.emplace_back(instanceCount,0u); // @@ -4984,7 +4997,6 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul else compactedOwnershipReleaseIndices.push_back(~0u); } - reservations.m_blasBuildMap.clear(); // finish the last batch recordBuildCommands(); if (!flushRanges.empty()) @@ -5154,18 +5166,6 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // finish host tasks if not done yet hostUploadBuffers([]()->bool{return true;}); - // Descriptor Sets need their TLAS descriptors substituted if they've been compacted - // want to check if deps successfully exist - auto missingDependent = [&reservations](const typename asset_traits::video_t* dep)->bool - { - auto& stagingCache = std::get>(reservations.m_stagingCaches); - auto found = stagingCache.find(const_cast::video_t*>(dep)); - // this only checks if whether we had to convert and failed - if (found!=stagingCache.end() && found->second.value==CHashCache::NoContentHash) - return true; - // but the dependent might be in readCache of one or more converters, so if in doubt assume its okay - return false; - }; // insert items into cache if overflows handled fine and commandbuffers ready to be recorded auto mergeCache = [&]()->void { @@ -5277,7 +5277,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul mergeCache.template operator()(); mergeCache.template operator()(); 
mergeCache.template operator()(); - // write the TLASes into Descriptor Set finally + // overwrite the compacted TLASes in Descriptor Sets if (auto& tlasRewriteSet=reservations.m_potentialTLASRewrites; !tlasRewriteSet.empty()) { core::vector writes; From 09f16c2b36335cb7044d7935054fdb24e71f9263 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 28 Apr 2025 10:54:49 +0700 Subject: [PATCH 066/346] minor fixes, example --- examples_tests | 2 +- .../builtin/hlsl/workgroup2/arithmetic.hlsl | 36 +++++ .../builtin/hlsl/workgroup2/shared_scan.hlsl | 125 ++++++++++++++++++ 3 files changed, 162 insertions(+), 1 deletion(-) create mode 100644 include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl create mode 100644 include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl diff --git a/examples_tests b/examples_tests index 8c76367c1c..20011f5fdd 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 8c76367c1c226cce3d66f1c60f540e29a501a1cb +Subproject commit 20011f5fdd3e8454bb830ded6f4221ec75036809 diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl new file mode 100644 index 0000000000..dcd2a5df5d --- /dev/null +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl @@ -0,0 +1,36 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_INCLUDED_ +#define _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_INCLUDED_ + + +#include "nbl/builtin/hlsl/functional.hlsl" +#include "nbl/builtin/hlsl/workgroup/ballot.hlsl" +#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" +#include "nbl/builtin/hlsl/workgroup2/shared_scan.hlsl" + + +namespace nbl +{ +namespace hlsl +{ +namespace workgroup2 +{ + +template +struct reduction +{ + template + static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + impl::reduce fn; + fn.__call(dataAccessor, scratchAccessor); + } +} + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl new file mode 100644 index 0000000000..9c2eb164cf --- /dev/null +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -0,0 +1,125 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_WORKGROUP2_SHARED_SCAN_INCLUDED_ +#define _NBL_BUILTIN_HLSL_WORKGROUP2_SHARED_SCAN_INCLUDED_ + +#include "nbl/builtin/hlsl/cpp_compat.hlsl" +#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" +#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" +#include "nbl/builtin/hlsl/subgroup/ballot.hlsl" +#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace workgroup2 +{ + +template +struct Configuration +{ + NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(_WorkgroupSize); + NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = uint16_t(_SubgroupSizeLog2); + NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation = uint16_t(_ItemsPerInvocation); + + // must have at least enough level 0 outputs to feed a single subgroup + NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroup = hlsl::max(WorkgroupSize >> SubgroupSizeLog2, SubgroupSize); + NBL_CONSTEXPR_STATIC_INLINE uint32_t VirtualWorkgroupSize = SubgroupsPerVirtualWorkgroup << SubgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation[2] = { Config::ItemsPerInvocation, SubgroupsPerVirtualWorkgroup >> SubgroupSizeLog2 }; + static_assert(ItemsPerInvocation[1]<=4, "3 level scan would have been needed with this config!"); +}; + +namespace impl +{ + +template +struct reduce +{ + using scalar_t = typename BinOp::type_t; + using vector_lv0_t = vector; // data accessor needs to be this type + using vector_lv1_t = vector; // scratch smem accessor needs to be this type + + template + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) // groupshared vector_lv1_t scratch[Config::SubgroupsPerVirtualWorkgroup] + { + using config_t = subgroup2::Configuration; + using params_lv0_t = 
subgroup2::ArithmeticParams; + using params_lv1_t = subgroup2::ArithmeticParams; + BinOp binop; + + vector_lv0_t scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; + const uint32_t invocationIndex = SubgroupContiguousIndex(); + subgroup2::inclusive_scan inclusiveScan0; + // level 0 scan + [unroll] + for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + scan_local[idx] = inclusiveScan0(dataAccessor.get(idx * Config::WorkgroupSize + virtualInvocationIndex)); + if (subgroup::ElectLast()) + { + const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + scratchAccessor.set(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation[0]-1]); // set last element of subgroup scan (reduction) to level 1 scan + } + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + subgroup2::inclusive_scan inclusiveScan1; + // level 1 scan + if (glsl::gl_SubgroupID() == 0) + { + scratchAccessor.set(invocationIndex, inclusiveScan1(scratchAccessor.get(invocationIndex))); + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // set as last element in scan (reduction) + [unroll] + for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + dataAccessor.set(idx * Config::WorkgroupSize + virtualInvocationIndex, scratchAccessor.get(Config::SubgroupsPerVirtualWorkgroup-1)); + } + } +}; + +template +struct scan +{ + using scalar_t = typename BinOp::type_t; + using vector_lv0_t = vector; // data accessor needs to be this type + using vector_lv1_t = vector; // scratch smem accessor needs to be this type + + template + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) // groupshared 
vector_lv1_t scratch[Config::SubgroupsPerVirtualWorkgroup] + { + // TODO get this working + // same thing for level 0 + + subgroup2::inclusive_scan inclusiveScan1; + // level 1 scan + if (glsl::gl_SubgroupID() == 0) + { + const vector_lv1_t shiftedInput = hlsl::mix(BinOp::identity, scratchAccessor.get(invocationIndex-1), bool(invocationIndex)); + scratchAccessor.set(invocationIndex, inclusiveScan1(shiftedInput)); + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // combine with level 0 + [unroll] + for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + dataAccessor.set(idx * Config::WorkgroupSize + virtualInvocationIndex, binop(scratchAccessor.get(virtualSubgroupID), scan_local[idx])); + } + } +}; + +} + +} +} +} + +#endif From 6f5f8b05bc33cc8ea848d3f003bc7218a2d6bbac Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 28 Apr 2025 17:03:39 +0700 Subject: [PATCH 067/346] bug fixes and example --- .../builtin/hlsl/workgroup2/arithmetic.hlsl | 4 +- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 69 ++++++++++--------- 2 files changed, 40 insertions(+), 33 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl index dcd2a5df5d..2753344e43 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl @@ -25,9 +25,9 @@ struct reduction static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { impl::reduce fn; - fn.__call(dataAccessor, scratchAccessor); + fn.template __call(dataAccessor, scratchAccessor); } -} +}; } } diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 9c2eb164cf..7be002e8d3 100644 --- 
a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -9,6 +9,7 @@ #include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" #include "nbl/builtin/hlsl/subgroup/ballot.hlsl" #include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" +#include "nbl/builtin/hlsl/mpl.hlsl" namespace nbl { @@ -23,13 +24,15 @@ struct Configuration NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(_WorkgroupSize); NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = uint16_t(_SubgroupSizeLog2); NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; - NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation = uint16_t(_ItemsPerInvocation); + // NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation = uint16_t(_ItemsPerInvocation); // must have at least enough level 0 outputs to feed a single subgroup - NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroup = hlsl::max(WorkgroupSize >> SubgroupSizeLog2, SubgroupSize); + NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroup = mpl::max> SubgroupSizeLog2), SubgroupSize>::value; //TODO expression not constant apparently NBL_CONSTEXPR_STATIC_INLINE uint32_t VirtualWorkgroupSize = SubgroupsPerVirtualWorkgroup << SubgroupSizeLog2; - NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation[2] = { Config::ItemsPerInvocation, SubgroupsPerVirtualWorkgroup >> SubgroupSizeLog2 }; - static_assert(ItemsPerInvocation[1]<=4, "3 level scan would have been needed with this config!"); + // NBL_CONSTEXPR_STATIC_INLINE uint32_t2 ItemsPerInvocation; TODO? 
doesn't allow inline definitions for uint32_t2 for some reason, uint32_t[2] as well ; declaring out of line results in not constant expression + NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_0 = _ItemsPerInvocation; + NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_1 = SubgroupsPerVirtualWorkgroup >> SubgroupSizeLog2; + static_assert(ItemsPerInvocation_1<=4, "3 level scan would have been needed with this config!"); }; namespace impl @@ -39,19 +42,19 @@ template struct reduce { using scalar_t = typename BinOp::type_t; - using vector_lv0_t = vector; // data accessor needs to be this type - using vector_lv1_t = vector; // scratch smem accessor needs to be this type + using vector_lv0_t = vector; // data accessor needs to be this type + using vector_lv1_t = vector; // scratch smem accessor needs to be this type template void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) // groupshared vector_lv1_t scratch[Config::SubgroupsPerVirtualWorkgroup] { using config_t = subgroup2::Configuration; - using params_lv0_t = subgroup2::ArithmeticParams; - using params_lv1_t = subgroup2::ArithmeticParams; + using params_lv0_t = subgroup2::ArithmeticParams; + using params_lv1_t = subgroup2::ArithmeticParams; BinOp binop; vector_lv0_t scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; - const uint32_t invocationIndex = SubgroupContiguousIndex(); + const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); subgroup2::inclusive_scan inclusiveScan0; // level 0 scan [unroll] @@ -61,7 +64,7 @@ struct reduce if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - scratchAccessor.set(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation[0]-1]); // set last element of subgroup scan (reduction) to level 1 scan + scratchAccessor.set(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set 
last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -88,31 +91,35 @@ template; // data accessor needs to be this type - using vector_lv1_t = vector; // scratch smem accessor needs to be this type + using vector_lv0_t = vector; // data accessor needs to be this type + using vector_lv1_t = vector; // scratch smem accessor needs to be this type template void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) // groupshared vector_lv1_t scratch[Config::SubgroupsPerVirtualWorkgroup] { - // TODO get this working - // same thing for level 0 - - subgroup2::inclusive_scan inclusiveScan1; - // level 1 scan - if (glsl::gl_SubgroupID() == 0) - { - const vector_lv1_t shiftedInput = hlsl::mix(BinOp::identity, scratchAccessor.get(invocationIndex-1), bool(invocationIndex)); - scratchAccessor.set(invocationIndex, inclusiveScan1(shiftedInput)); - } - scratchAccessor.workgroupExecutionAndMemoryBarrier(); - - // combine with level 0 - [unroll] - for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) - { - const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - dataAccessor.set(idx * Config::WorkgroupSize + virtualInvocationIndex, binop(scratchAccessor.get(virtualSubgroupID), scan_local[idx])); - } + // // TODO get this working + // // same thing for level 0 + // using config_t = subgroup2::Configuration; + // using params_lv0_t = subgroup2::ArithmeticParams; + // using params_lv1_t = subgroup2::ArithmeticParams; + // BinOp binop; + + // subgroup2::inclusive_scan inclusiveScan1; + // // level 1 scan + // if (glsl::gl_SubgroupID() == 0) + // { + // const vector_lv1_t shiftedInput = hlsl::mix(BinOp::identity, scratchAccessor.get(invocationIndex-1), bool(invocationIndex)); + // scratchAccessor.set(invocationIndex, 
inclusiveScan1(shiftedInput)); + // } + // scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // // combine with level 0 + // [unroll] + // for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + // { + // const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + // dataAccessor.set(idx * Config::WorkgroupSize + virtualInvocationIndex, binop(scratchAccessor.get(virtualSubgroupID), scan_local[idx])); + // } } }; From 1bac2478f5f09c05b45fa625c70da6ca44023970 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 29 Apr 2025 12:05:04 +0700 Subject: [PATCH 068/346] fix to data accessor indexing --- include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 7be002e8d3..3cba3a2d57 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -60,7 +60,7 @@ struct reduce [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - scan_local[idx] = inclusiveScan0(dataAccessor.get(idx * Config::WorkgroupSize + virtualInvocationIndex)); + scan_local[idx] = inclusiveScan0(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex)); if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); @@ -70,6 +70,7 @@ struct reduce scratchAccessor.workgroupExecutionAndMemoryBarrier(); subgroup2::inclusive_scan inclusiveScan1; + // subgroup2::reduction reduce1; // level 1 scan if (glsl::gl_SubgroupID() == 0) { @@ -81,8 +82,8 @@ struct reduce [unroll] for (uint32_t 
idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - dataAccessor.set(idx * Config::WorkgroupSize + virtualInvocationIndex, scratchAccessor.get(Config::SubgroupsPerVirtualWorkgroup-1)); + // const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scratchAccessor.get(Config::SubgroupSize-1)); } } }; From 305ac7bd3997f7b491ff9adb30a8f9c8e54ab5ca Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 29 Apr 2025 16:58:04 +0700 Subject: [PATCH 069/346] added template spec for vector dim 1 --- include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl | 1 + 1 file changed, 1 insertion(+) diff --git a/include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl b/include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl index 9aefc3b3d8..652cabd7c7 100644 --- a/include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl +++ b/include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl @@ -28,6 +28,7 @@ struct vector_traits >\ NBL_CONSTEXPR_STATIC_INLINE bool IsVector = true;\ };\ +DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(1) DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(2) DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(3) DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(4) From c08063da62a3bed85cb4ff9d59668ed7474604f7 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 29 Apr 2025 17:03:13 +0700 Subject: [PATCH 070/346] added inclusive scan --- .../builtin/hlsl/workgroup2/arithmetic.hlsl | 11 +++ .../builtin/hlsl/workgroup2/shared_scan.hlsl | 77 +++++++++++-------- 2 files changed, 57 insertions(+), 31 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl 
b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl index 2753344e43..acfa5feba8 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl @@ -29,6 +29,17 @@ struct reduction } }; +template +struct inclusive_scan +{ + template + static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + impl::scan fn; + fn.template __call(dataAccessor, scratchAccessor); + } +}; + } } } diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 3cba3a2d57..6358bf24ad 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -24,7 +24,6 @@ struct Configuration NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(_WorkgroupSize); NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = uint16_t(_SubgroupSizeLog2); NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; - // NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation = uint16_t(_ItemsPerInvocation); // must have at least enough level 0 outputs to feed a single subgroup NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroup = mpl::max> SubgroupSizeLog2), SubgroupSize>::value; //TODO expression not constant apparently @@ -46,7 +45,7 @@ struct reduce using vector_lv1_t = vector; // scratch smem accessor needs to be this type template - void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) // groupshared vector_lv1_t scratch[Config::SubgroupsPerVirtualWorkgroup] + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { using config_t = subgroup2::Configuration; using params_lv0_t = subgroup2::ArithmeticParams; @@ -55,8 +54,8 @@ struct reduce vector_lv0_t scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; const uint32_t 
invocationIndex = workgroup::SubgroupContiguousIndex(); - subgroup2::inclusive_scan inclusiveScan0; // level 0 scan + subgroup2::inclusive_scan inclusiveScan0; [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { @@ -69,9 +68,8 @@ struct reduce } scratchAccessor.workgroupExecutionAndMemoryBarrier(); - subgroup2::inclusive_scan inclusiveScan1; - // subgroup2::reduction reduce1; // level 1 scan + subgroup2::inclusive_scan inclusiveScan1; if (glsl::gl_SubgroupID() == 0) { scratchAccessor.set(invocationIndex, inclusiveScan1(scratchAccessor.get(invocationIndex))); @@ -82,13 +80,12 @@ struct reduce [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - // const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scratchAccessor.get(Config::SubgroupSize-1)); } } }; -template +template struct scan { using scalar_t = typename BinOp::type_t; @@ -96,31 +93,49 @@ struct scan using vector_lv1_t = vector; // scratch smem accessor needs to be this type template - void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) // groupshared vector_lv1_t scratch[Config::SubgroupsPerVirtualWorkgroup] + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { - // // TODO get this working - // // same thing for level 0 - // using config_t = subgroup2::Configuration; - // using params_lv0_t = subgroup2::ArithmeticParams; - // using params_lv1_t = subgroup2::ArithmeticParams; - // BinOp binop; - - // subgroup2::inclusive_scan inclusiveScan1; - // // level 1 scan - // if (glsl::gl_SubgroupID() == 0) - // { - // const vector_lv1_t shiftedInput = 
hlsl::mix(BinOp::identity, scratchAccessor.get(invocationIndex-1), bool(invocationIndex)); - // scratchAccessor.set(invocationIndex, inclusiveScan1(shiftedInput)); - // } - // scratchAccessor.workgroupExecutionAndMemoryBarrier(); - - // // combine with level 0 - // [unroll] - // for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) - // { - // const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - // dataAccessor.set(idx * Config::WorkgroupSize + virtualInvocationIndex, binop(scratchAccessor.get(virtualSubgroupID), scan_local[idx])); - // } + using config_t = subgroup2::Configuration; + using params_lv0_t = subgroup2::ArithmeticParams; + using params_lv1_t = subgroup2::ArithmeticParams; + BinOp binop; + + vector_lv0_t scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; + const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); + subgroup2::inclusive_scan inclusiveScan0; + // level 0 scan + [unroll] + for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + scan_local[idx] = inclusiveScan0(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex)); + if (subgroup::ElectLast()) + { + const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + scratchAccessor.set(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + } + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // level 1 scan + subgroup2::inclusive_scan inclusiveScan1; + if (glsl::gl_SubgroupID() == 0) + { + const vector_lv1_t shiftedInput = hlsl::mix(hlsl::promote(BinOp::identity), scratchAccessor.get(invocationIndex-1), 
bool(invocationIndex)); + scratchAccessor.set(invocationIndex, inclusiveScan1(shiftedInput)); + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // combine with level 0 + [unroll] + for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + const vector_lv1_t lhs = scratchAccessor.get(virtualSubgroupID); + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) + scan_local[idx][i] = binop(lhs, scan_local[idx][i]); + dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + } } }; From b1d804f520eed03d72a1d625bb904e777a34b23a Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 30 Apr 2025 14:08:38 +0700 Subject: [PATCH 071/346] exclusive scan working --- .../builtin/hlsl/workgroup2/arithmetic.hlsl | 11 +++++++++++ .../builtin/hlsl/workgroup2/shared_scan.hlsl | 18 ++++++++++++++---- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl index acfa5feba8..6824e92afa 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl @@ -40,6 +40,17 @@ struct inclusive_scan } }; +template +struct exclusive_scan +{ + template + static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + impl::scan fn; + fn.template __call(dataAccessor, scratchAccessor); + } +}; + } } } diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 6358bf24ad..331951d3f3 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl 
@@ -130,10 +130,20 @@ struct scan for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - const vector_lv1_t lhs = scratchAccessor.get(virtualSubgroupID); - [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) - scan_local[idx][i] = binop(lhs, scan_local[idx][i]); + const vector_lv1_t left = scratchAccessor.get(virtualSubgroupID); + if (Exclusive) + { + scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(scan_local[idx][Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) + scan_local[idx][Config::ItemsPerInvocation_0-i-1] = binop(left, hlsl::mix(scan_local[idx][Config::ItemsPerInvocation_0-i-2], left_last_elem, (Config::ItemsPerInvocation_0-i-1==0))); + } + else + { + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) + scan_local[idx][i] = binop(left, scan_local[idx][i]); + } dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); } } From 3cf98ab4abe77fecd7a779d58c7f85c42d85251e Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 30 Apr 2025 14:12:55 +0700 Subject: [PATCH 072/346] removed outdated comment --- include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 331951d3f3..cd49cb1c1b 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -26,7 +26,7 @@ struct Configuration NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; // must have 
at least enough level 0 outputs to feed a single subgroup - NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroup = mpl::max> SubgroupSizeLog2), SubgroupSize>::value; //TODO expression not constant apparently + NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroup = mpl::max> SubgroupSizeLog2), SubgroupSize>::value; NBL_CONSTEXPR_STATIC_INLINE uint32_t VirtualWorkgroupSize = SubgroupsPerVirtualWorkgroup << SubgroupSizeLog2; // NBL_CONSTEXPR_STATIC_INLINE uint32_t2 ItemsPerInvocation; TODO? doesn't allow inline definitions for uint32_t2 for some reason, uint32_t[2] as well ; declaring out of line results in not constant expression NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_0 = _ItemsPerInvocation; From 7b310e01f9c4c557dec87555121c3ee7cebed456 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 1 May 2025 12:18:35 +0700 Subject: [PATCH 073/346] minor changes to config usage --- include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index cd49cb1c1b..c789c8a482 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -18,19 +18,20 @@ namespace hlsl namespace workgroup2 { -template +template struct Configuration { - NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(_WorkgroupSize); + NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << WorkgroupSizeLog2; NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = uint16_t(_SubgroupSizeLog2); NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; + static_assert(WorkgroupSizeLog2>=_SubgroupSizeLog2, "WorkgroupSize cannot be smaller than SubgroupSize"); // must have at least enough level 0 outputs to feed a single subgroup - NBL_CONSTEXPR_STATIC_INLINE uint32_t 
SubgroupsPerVirtualWorkgroup = mpl::max> SubgroupSizeLog2), SubgroupSize>::value; - NBL_CONSTEXPR_STATIC_INLINE uint32_t VirtualWorkgroupSize = SubgroupsPerVirtualWorkgroup << SubgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroupLog2 = mpl::max::value - SubgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint32_t VirtualWorkgroupSize = uint32_t(0x1u) << (SubgroupsPerVirtualWorkgroupLog2 + SubgroupSizeLog2); // NBL_CONSTEXPR_STATIC_INLINE uint32_t2 ItemsPerInvocation; TODO? doesn't allow inline definitions for uint32_t2 for some reason, uint32_t[2] as well ; declaring out of line results in not constant expression NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_0 = _ItemsPerInvocation; - NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_1 = SubgroupsPerVirtualWorkgroup >> SubgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_1 = uint32_t(0x1u) << (SubgroupsPerVirtualWorkgroupLog2 - SubgroupSizeLog2); static_assert(ItemsPerInvocation_1<=4, "3 level scan would have been needed with this config!"); }; From 4b4e7e8f3685f4a825997ba7a3ea5fc2594883f4 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 1 May 2025 17:19:13 +0700 Subject: [PATCH 074/346] add 1 level scans --- .../builtin/hlsl/workgroup2/arithmetic.hlsl | 6 +- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 69 ++++++++++++++++++- 2 files changed, 69 insertions(+), 6 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl index 6824e92afa..3b4a028d2c 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl @@ -24,7 +24,7 @@ struct reduction template static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { - impl::reduce fn; + impl::reduce fn; fn.template __call(dataAccessor, scratchAccessor); } }; @@ -35,7 +35,7 @@ struct inclusive_scan template static void 
__call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { - impl::scan fn; + impl::scan fn; fn.template __call(dataAccessor, scratchAccessor); } }; @@ -46,7 +46,7 @@ struct exclusive_scan template static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { - impl::scan fn; + impl::scan fn; fn.template __call(dataAccessor, scratchAccessor); } }; diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index c789c8a482..c18c00f83e 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -26,11 +26,13 @@ struct Configuration NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; static_assert(WorkgroupSizeLog2>=_SubgroupSizeLog2, "WorkgroupSize cannot be smaller than SubgroupSize"); + NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = conditional_value::value; + // must have at least enough level 0 outputs to feed a single subgroup NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroupLog2 = mpl::max::value - SubgroupSizeLog2; NBL_CONSTEXPR_STATIC_INLINE uint32_t VirtualWorkgroupSize = uint32_t(0x1u) << (SubgroupsPerVirtualWorkgroupLog2 + SubgroupSizeLog2); // NBL_CONSTEXPR_STATIC_INLINE uint32_t2 ItemsPerInvocation; TODO? 
doesn't allow inline definitions for uint32_t2 for some reason, uint32_t[2] as well ; declaring out of line results in not constant expression - NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_0 = _ItemsPerInvocation; + NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_0 = conditional_value::value; NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_1 = uint32_t(0x1u) << (SubgroupsPerVirtualWorkgroupLog2 - SubgroupSizeLog2); static_assert(ItemsPerInvocation_1<=4, "3 level scan would have been needed with this config!"); }; @@ -38,8 +40,69 @@ struct Configuration namespace impl { +template +struct reduce; + +template +struct scan; + +// 1-level scans +template +struct reduce +{ + using scalar_t = typename BinOp::type_t; + using vector_t = vector; // data accessor needs to be this type + // doesn't use scratch smem, need as param? + + template + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_t = subgroup2::ArithmeticParams; + + subgroup2::reduction reduction; + if (glsl::gl_SubgroupID() == 0) + { + vector_t value = reduction(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex())); + dataAccessor.set(glsl::gl_WorkGroupID().x * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex(), value); // can be safely merged with top line? + } + } +}; + +template +struct scan +{ + using scalar_t = typename BinOp::type_t; + using vector_t = vector; // data accessor needs to be this type + // doesn't use scratch smem, need as param? 
+ + template + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_t = subgroup2::ArithmeticParams; + + if (glsl::gl_SubgroupID() == 0) + { + vector_t value; + if (Exclusive) + { + subgroup2::exclusive_scan excl_scan; + value = excl_scan(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex())); + } + else + { + subgroup2::inclusive_scan incl_scan; + value = incl_scan(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex())); + } + dataAccessor.set(glsl::gl_WorkGroupID().x * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex(), value); // can be safely merged with above lines? + } + } +}; + +// 2-level scans template -struct reduce +struct reduce { using scalar_t = typename BinOp::type_t; using vector_lv0_t = vector; // data accessor needs to be this type @@ -87,7 +150,7 @@ struct reduce }; template -struct scan +struct scan { using scalar_t = typename BinOp::type_t; using vector_lv0_t = vector; // data accessor needs to be this type From 2e5f29f10e53f1f8632e8f45099cece1e4b72601 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 2 May 2025 09:41:52 +0700 Subject: [PATCH 075/346] fixes to 1 level scans --- include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index c18c00f83e..0128c3320d 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -63,8 +63,8 @@ struct reduce subgroup2::reduction reduction; if (glsl::gl_SubgroupID() == 0) { - vector_t value = reduction(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex())); - 
dataAccessor.set(glsl::gl_WorkGroupID().x * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex(), value); // can be safely merged with top line? + vector_t value = reduction(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex())); + dataAccessor.set(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); // can be safely merged with top line? } } }; @@ -88,14 +88,14 @@ struct scan if (Exclusive) { subgroup2::exclusive_scan excl_scan; - value = excl_scan(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex())); + value = excl_scan(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex())); } else { subgroup2::inclusive_scan incl_scan; - value = incl_scan(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex())); + value = incl_scan(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex())); } - dataAccessor.set(glsl::gl_WorkGroupID().x * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex(), value); // can be safely merged with above lines? + dataAccessor.set(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); // can be safely merged with above lines? 
} } }; From 054b26916204d3ece92e474cb87ec74ebdead9bb Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 2 May 2025 10:54:33 +0700 Subject: [PATCH 076/346] added handling >1 vectors on level 1 scan (untested) --- include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 0128c3320d..b32bc3efde 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -127,7 +127,7 @@ struct reduce if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - scratchAccessor.set(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + scratchAccessor.setByComponent(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -144,7 +144,7 @@ struct reduce [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scratchAccessor.get(Config::SubgroupSize-1)); + dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scratchAccessor.getByComponent((1u << Config::SubgroupsPerVirtualWorkgroupLog2)-1)); } } }; @@ -175,7 +175,7 @@ struct scan if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - scratchAccessor.set(virtualSubgroupID, 
scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + scratchAccessor.setByComponent(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -194,7 +194,7 @@ struct scan for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - const vector_lv1_t left = scratchAccessor.get(virtualSubgroupID); + const scalar_t left = scratchAccessor.getByComponent(virtualSubgroupID); if (Exclusive) { scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(scan_local[idx][Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); From 1b5282c8b5c37a3d387ec89ce2c2ea12384c41b7 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 5 May 2025 17:16:12 +0700 Subject: [PATCH 077/346] move load/store smem into scan funcs, setup config for 3 levels --- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 200 +++++++++++++++++- 1 file changed, 191 insertions(+), 9 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index b32bc3efde..c88694d1ac 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -18,6 +18,25 @@ namespace hlsl namespace workgroup2 { +namespace impl +{ +template +struct virtual_wg_size_log2 +{ + NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2+2),uint16_t,conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2*2+2),uint16_t,3,2>::value,1>::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t value = mpl::max_v+SubgroupSizeLog2; +}; + +template +struct 
items_per_invocation +{ + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocationProductLog2 = mpl::max_v; + NBL_CONSTEXPR_STATIC_INLINE uint16_t value0 = conditional_value::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t value1 = uint16_t(0x1u) << conditional_value, ItemsPerInvocationProductLog2>::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t value2 = uint16_t(0x1u) << mpl::max_v; +}; +} + template struct Configuration { @@ -26,17 +45,43 @@ struct Configuration NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; static_assert(WorkgroupSizeLog2>=_SubgroupSizeLog2, "WorkgroupSize cannot be smaller than SubgroupSize"); - NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = conditional_value::value; - // must have at least enough level 0 outputs to feed a single subgroup - NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroupLog2 = mpl::max::value - SubgroupSizeLog2; - NBL_CONSTEXPR_STATIC_INLINE uint32_t VirtualWorkgroupSize = uint32_t(0x1u) << (SubgroupsPerVirtualWorkgroupLog2 + SubgroupSizeLog2); + NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroupLog2 = mpl::max_v; + + using virtual_wg_t = impl::virtual_wg_size_log2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = virtual_wg_t::levels; + NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualWorkgroupSize = uint16_t(0x1u) << virtual_wg_t::value; + using items_per_invoc_t = impl::items_per_invocation; // NBL_CONSTEXPR_STATIC_INLINE uint32_t2 ItemsPerInvocation; TODO? 
doesn't allow inline definitions for uint32_t2 for some reason, uint32_t[2] as well ; declaring out of line results in not constant expression - NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_0 = conditional_value::value; - NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_1 = uint32_t(0x1u) << (SubgroupsPerVirtualWorkgroupLog2 - SubgroupSizeLog2); + NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_0 = items_per_invoc_t::value0; + NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_1 = items_per_invoc_t::value1; + NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_2 = items_per_invoc_t::value2; static_assert(ItemsPerInvocation_1<=4, "3 level scan would have been needed with this config!"); }; +// special case when workgroup size 2048 and subgroup size 16 needs 3 levels and virtual workgroup size 4096 to get a full subgroup scan each on level 1 and 2 16x16x16=4096 +// specializing with macros because of DXC bug: https://github.com/microsoft/DirectXShaderCompiler/issues/7007 +#define SPECIALIZE_CONFIG_CASE_2048_16(ITEMS_PER_INVOC) template<>\
struct Configuration<11, 4, ITEMS_PER_INVOC>\
{\
 NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << 11u;\
 NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = uint16_t(4u);\
 NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2;\
 NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroupLog2 = 128u;\
 NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = 3;\
 NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualWorkgroupSize = uint16_t(0x1u) << 4096;\
 NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_0 = ITEMS_PER_INVOC;\
 NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_1 = 1u;\
 NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_2 = 1u;\
};\
 SPECIALIZE_CONFIG_CASE_2048_16(1) SPECIALIZE_CONFIG_CASE_2048_16(2) SPECIALIZE_CONFIG_CASE_2048_16(4) #undef SPECIALIZE_CONFIG_CASE_2048_16 namespace impl { @@ 
-127,7 +172,62 @@ struct reduce if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - scratchAccessor.setByComponent(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + scratchAccessor.set(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + } + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // level 1 scan + subgroup2::inclusive_scan inclusiveScan1; + if (glsl::gl_SubgroupID() == 0) + { + vector_lv1_t lv1_val; + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.get(invocationIndex*Config::ItemsPerInvocation_1+i,lv1_val[i]); + lv1_val = inclusiveScan1(lv1_val); + scratchAccessor.set(invocationIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // set as last element in scan (reduction) + [unroll] + for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + scalar_t reduce_val; + scratchAccessor.get(Config::SubgroupSize-1,reduce_val); + dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, reduce_val); + } + } +}; + +template +struct scan +{ + using scalar_t = typename BinOp::type_t; + using vector_lv0_t = vector; // data accessor needs to be this type + using vector_lv1_t = vector; // scratch smem accessor needs to be this type + + template + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_lv0_t = subgroup2::ArithmeticParams; + using params_lv1_t = subgroup2::ArithmeticParams; + BinOp binop; + + vector_lv0_t 
scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; + const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); + subgroup2::inclusive_scan inclusiveScan0; + // level 0 scan + [unroll] + for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + scan_local[idx] = inclusiveScan0(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex)); + if (subgroup::ElectLast()) + { + const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + scratchAccessor.set(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -135,11 +235,93 @@ struct reduce // level 1 scan subgroup2::inclusive_scan inclusiveScan1; if (glsl::gl_SubgroupID() == 0) + { + vector_lv1_t lv1_val; + const uint32_t prevIndex = invocationIndex-1; + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.get(prevIndex*Config::ItemsPerInvocation_1+i,lv1_val[i]); + vector_lv1_t shiftedInput = hlsl::mix(hlsl::promote(BinOp::identity), lv1_val, bool(invocationIndex)); + shiftedInput = inclusiveScan1(shiftedInput); + scratchAccessor.set(invocationIndex, shiftedInput[Config::ItemsPerInvocation_1-1]); + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // combine with level 0 + [unroll] + for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + scalar_t left; + scratchAccessor.get(virtualSubgroupID,left); + if (Exclusive) + { + scalar_t left_last_elem = hlsl::mix(BinOp::identity, 
glsl::subgroupShuffleUp(scan_local[idx][Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) + scan_local[idx][Config::ItemsPerInvocation_0-i-1] = binop(left, hlsl::mix(scan_local[idx][Config::ItemsPerInvocation_0-i-2], left_last_elem, (Config::ItemsPerInvocation_0-i-1==0))); + } + else + { + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) + scan_local[idx][i] = binop(left, scan_local[idx][i]); + } + dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + } + } +}; + +// 2-level scans +/* +template +struct reduce +{ + using scalar_t = typename BinOp::type_t; + using vector_lv0_t = vector; // data accessor needs to be this type + using vector_lv1_t = vector; // scratch smem accessor needs to be this type + + template + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_lv0_t = subgroup2::ArithmeticParams; + using params_lv1_t = subgroup2::ArithmeticParams; + BinOp binop; + + vector_lv0_t scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; + const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); + // level 0 scan + subgroup2::inclusive_scan inclusiveScan0; + [unroll] + for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + scan_local[idx] = inclusiveScan0(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex)); + if (subgroup::ElectLast()) + { + const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + scratchAccessor.setByComponent(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation_0-1]); // 
set last element of subgroup scan (reduction) to level 1 scan + } + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // level 1 scan + subgroup2::inclusive_scan inclusiveScan1; + if (glsl::gl_SubgroupID() < Config::SubgroupSizeLog2*Config::ItemsPerInvocation_1) { scratchAccessor.set(invocationIndex, inclusiveScan1(scratchAccessor.get(invocationIndex))); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); + // level 2 scan + // TODO + subgroup2::inclusive_scan inclusiveScan2; + if (glsl::gl_SubgroupID() == 0) + { + scratchAccessor.set(invocationIndex, inclusiveScan2(scratchAccessor.get(invocationIndex))); + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + // set as last element in scan (reduction) [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) @@ -150,7 +332,7 @@ struct reduce }; template -struct scan +struct scan { using scalar_t = typename BinOp::type_t; using vector_lv0_t = vector; // data accessor needs to be this type @@ -212,7 +394,7 @@ struct scan } } }; - +*/ } } From c6dc5bc9579877d03f2e1e5531ef527cdd1b4eda Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 6 May 2025 10:52:05 +0700 Subject: [PATCH 078/346] change to use coalesced indexing for 2-level scans --- .../nbl/builtin/hlsl/workgroup2/shared_scan.hlsl | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index c88694d1ac..26fb969ace 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -172,7 +172,8 @@ struct reduce if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - scratchAccessor.set(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of 
subgroup scan (reduction) to level 1 scan + const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroupLog2 + (virtualSubgroupID/Config::ItemsPerInvocation_1); + scratchAccessor.set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -184,7 +185,7 @@ struct reduce vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.get(invocationIndex*Config::ItemsPerInvocation_1+i,lv1_val[i]); + scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+invocationIndex,lv1_val[i]); lv1_val = inclusiveScan1(lv1_val); scratchAccessor.set(invocationIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } @@ -227,7 +228,8 @@ struct scan if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - scratchAccessor.set(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroupLog2 + (virtualSubgroupID/Config::ItemsPerInvocation_1); + scratchAccessor.set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -240,7 +242,7 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.get(prevIndex*Config::ItemsPerInvocation_1+i,lv1_val[i]); + scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+prevIndex,lv1_val[i]); vector_lv1_t shiftedInput = hlsl::mix(hlsl::promote(BinOp::identity), lv1_val, bool(invocationIndex)); shiftedInput 
= inclusiveScan1(shiftedInput); scratchAccessor.set(invocationIndex, shiftedInput[Config::ItemsPerInvocation_1-1]); @@ -272,8 +274,7 @@ struct scan } }; -// 2-level scans -/* +// 3-level scans template struct reduce { @@ -394,7 +395,7 @@ struct scan } } }; -*/ + } } From aa0c36c8b48f480325c74334fa2fb8400b1fc76e Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 6 May 2025 14:35:02 +0700 Subject: [PATCH 079/346] added 3-level scans --- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 69 +++++++++++++++---- 1 file changed, 56 insertions(+), 13 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 26fb969ace..91596bace0 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -151,7 +151,7 @@ struct reduce { using scalar_t = typename BinOp::type_t; using vector_lv0_t = vector; // data accessor needs to be this type - using vector_lv1_t = vector; // scratch smem accessor needs to be this type + using vector_lv1_t = vector; template void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) @@ -207,7 +207,7 @@ struct scan { using scalar_t = typename BinOp::type_t; using vector_lv0_t = vector; // data accessor needs to be this type - using vector_lv1_t = vector; // scratch smem accessor needs to be this type + using vector_lv1_t = vector; template void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) @@ -280,7 +280,8 @@ struct reduce { using scalar_t = typename BinOp::type_t; using vector_lv0_t = vector; // data accessor needs to be this type - using vector_lv1_t = vector; // scratch smem accessor needs to be this type + using vector_lv1_t = vector; + using vector_lv2_t = vector; template void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) @@ -288,6 +289,7 @@ struct reduce using config_t = 
subgroup2::Configuration; using params_lv0_t = subgroup2::ArithmeticParams; using params_lv1_t = subgroup2::ArithmeticParams; + using params_lv2_t = subgroup2::ArithmeticParams; BinOp binop; vector_lv0_t scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; @@ -301,7 +303,8 @@ struct reduce if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - scratchAccessor.setByComponent(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroupLog2 + (virtualSubgroupID/Config::ItemsPerInvocation_1); + scratchAccessor.set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -310,16 +313,29 @@ struct reduce subgroup2::inclusive_scan inclusiveScan1; if (glsl::gl_SubgroupID() < Config::SubgroupSizeLog2*Config::ItemsPerInvocation_1) { - scratchAccessor.set(invocationIndex, inclusiveScan1(scratchAccessor.get(invocationIndex))); + vector_lv1_t lv1_val; + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+invocationIndex,lv1_val[i]); + lv1_val = inclusiveScan1(lv1_val); + if (subgroup::ElectLast()) + { + const uint32_t bankedIndex = (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroupLog2 + (invocationIndex/Config::ItemsPerInvocation_2); + scratchAccessor.set(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); // level 2 scan - // TODO - subgroup2::inclusive_scan inclusiveScan2; + subgroup2::inclusive_scan inclusiveScan2; if (glsl::gl_SubgroupID() == 0) { - 
scratchAccessor.set(invocationIndex, inclusiveScan2(scratchAccessor.get(invocationIndex))); + vector_lv2_t lv2_val; + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) + scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+invocationIndex,lv2_val[i]); + lv2_val = inclusiveScan2(lv2_val); + scratchAccessor.set(invocationIndex, lv2_val[Config::ItemsPerInvocation_2-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -327,7 +343,9 @@ struct reduce [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scratchAccessor.getByComponent((1u << Config::SubgroupsPerVirtualWorkgroupLog2)-1)); + scalar_t reduce_val; + scratchAccessor.get(Config::SubgroupSize-1,reduce_val); + dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, reduce_val); } } }; @@ -358,17 +376,41 @@ struct scan if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - scratchAccessor.setByComponent(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroupLog2 + (virtualSubgroupID/Config::ItemsPerInvocation_1); + scratchAccessor.set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); // level 1 scan subgroup2::inclusive_scan inclusiveScan1; + if (glsl::gl_SubgroupID() < Config::SubgroupSizeLog2*Config::ItemsPerInvocation_1) + { + vector_lv1_t lv1_val; + 
[unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+invocationIndex,lv1_val[i]); + lv1_val = inclusiveScan1(lv1_val); + if (subgroup::ElectLast()) + { + const uint32_t bankedIndex = (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroupLog2 + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); + scratchAccessor.set(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + } + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // level 2 scan + subgroup2::inclusive_scan inclusiveScan2; if (glsl::gl_SubgroupID() == 0) { - const vector_lv1_t shiftedInput = hlsl::mix(hlsl::promote(BinOp::identity), scratchAccessor.get(invocationIndex-1), bool(invocationIndex)); - scratchAccessor.set(invocationIndex, inclusiveScan1(shiftedInput)); + vector_lv2_t lv2_val; + const uint32_t prevIndex = invocationIndex-1; + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) + scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+prevIndex,lv2_val[i]); + vector_lv2_t shiftedInput = hlsl::mix(hlsl::promote(BinOp::identity), lv2_val, bool(invocationIndex)); + shiftedInput = inclusiveScan2(shiftedInput); + scratchAccessor.set(invocationIndex, shiftedInput[Config::ItemsPerInvocation_2-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -377,7 +419,8 @@ struct scan for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - const scalar_t left = scratchAccessor.getByComponent(virtualSubgroupID); + const scalar_t left; + scratchAccessor.get(virtualSubgroupID, left); if (Exclusive) { scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(scan_local[idx][Config::ItemsPerInvocation_0-1],1), 
bool(glsl::gl_SubgroupInvocationID())); From 74c359bed10f1a2d3d55b126863f3d962b87826d Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 6 May 2025 14:41:01 +0700 Subject: [PATCH 080/346] minor bug fixes --- include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 91596bace0..141deccb7b 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -355,7 +355,8 @@ struct scan { using scalar_t = typename BinOp::type_t; using vector_lv0_t = vector; // data accessor needs to be this type - using vector_lv1_t = vector; // scratch smem accessor needs to be this type + using vector_lv1_t = vector; + using vector_lv2_t = vector; template void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) @@ -363,6 +364,7 @@ struct scan using config_t = subgroup2::Configuration; using params_lv0_t = subgroup2::ArithmeticParams; using params_lv1_t = subgroup2::ArithmeticParams; + using params_lv2_t = subgroup2::ArithmeticParams; BinOp binop; vector_lv0_t scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; From ce244e2d24d2da9e79197226799098aaa7675be9 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 7 May 2025 16:55:34 +0700 Subject: [PATCH 081/346] changes to data accessor usage --- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 141deccb7b..057e9ebd24 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -108,7 +108,9 @@ struct reduce subgroup2::reduction reduction; if (glsl::gl_SubgroupID() == 0) { - vector_t value 
= reduction(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex())); + vector_t value; + dataAccessor.get(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); + value = reduction(value); dataAccessor.set(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); // can be safely merged with top line? } } @@ -130,15 +132,16 @@ struct scan if (glsl::gl_SubgroupID() == 0) { vector_t value; + dataAccessor.get(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); if (Exclusive) { subgroup2::exclusive_scan excl_scan; - value = excl_scan(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex())); + value = excl_scan(value); } else { subgroup2::inclusive_scan incl_scan; - value = incl_scan(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex())); + value = incl_scan(value); } dataAccessor.set(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); // can be safely merged with above lines? 
} @@ -168,7 +171,8 @@ struct reduce [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - scan_local[idx] = inclusiveScan0(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex)); + dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + scan_local[idx] = inclusiveScan0(scan_local[idx]); if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); @@ -224,7 +228,8 @@ struct scan [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - scan_local[idx] = inclusiveScan0(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex)); + dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + scan_local[idx] = inclusiveScan0(scan_local[idx]); if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); @@ -299,7 +304,8 @@ struct reduce [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - scan_local[idx] = inclusiveScan0(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex)); + dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + scan_local[idx] = inclusiveScan0(scan_local[idx]); if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * 
(Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); @@ -374,7 +380,8 @@ struct scan [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - scan_local[idx] = inclusiveScan0(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex)); + dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + scan_local[idx] = inclusiveScan0(scan_local[idx]); if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); From 90b19d817b7d5e9651ed755ff503873881e33311 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 8 May 2025 17:03:47 +0700 Subject: [PATCH 082/346] wg reduction uses reduce instead of scan --- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 057e9ebd24..7ed16faf09 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -167,12 +167,12 @@ struct reduce vector_lv0_t scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 0 scan - subgroup2::inclusive_scan inclusiveScan0; + subgroup2::reduction reduction0; [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); - scan_local[idx] = inclusiveScan0(scan_local[idx]); + scan_local[idx] 
= reduction0(scan_local[idx]); if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); @@ -183,14 +183,14 @@ struct reduce scratchAccessor.workgroupExecutionAndMemoryBarrier(); // level 1 scan - subgroup2::inclusive_scan inclusiveScan1; + subgroup2::reduction reduction1; if (glsl::gl_SubgroupID() == 0) { vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+invocationIndex,lv1_val[i]); - lv1_val = inclusiveScan1(lv1_val); + lv1_val = reduction1(lv1_val); scratchAccessor.set(invocationIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -200,7 +200,7 @@ struct reduce for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { scalar_t reduce_val; - scratchAccessor.get(Config::SubgroupSize-1,reduce_val); + scratchAccessor.get(glsl::gl_SubgroupInvocationID(),reduce_val); dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, reduce_val); } } @@ -300,12 +300,12 @@ struct reduce vector_lv0_t scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 0 scan - subgroup2::inclusive_scan inclusiveScan0; + subgroup2::reduction reduction0; [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); - scan_local[idx] = inclusiveScan0(scan_local[idx]); + scan_local[idx] = reduction0(scan_local[idx]); if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * 
(Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); @@ -316,14 +316,14 @@ struct reduce scratchAccessor.workgroupExecutionAndMemoryBarrier(); // level 1 scan - subgroup2::inclusive_scan inclusiveScan1; + subgroup2::reduction reduction1; if (glsl::gl_SubgroupID() < Config::SubgroupSizeLog2*Config::ItemsPerInvocation_1) { vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+invocationIndex,lv1_val[i]); - lv1_val = inclusiveScan1(lv1_val); + lv1_val = reduction1(lv1_val); if (subgroup::ElectLast()) { const uint32_t bankedIndex = (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroupLog2 + (invocationIndex/Config::ItemsPerInvocation_2); @@ -333,14 +333,14 @@ struct reduce scratchAccessor.workgroupExecutionAndMemoryBarrier(); // level 2 scan - subgroup2::inclusive_scan inclusiveScan2; + subgroup2::reduction reduction2; if (glsl::gl_SubgroupID() == 0) { vector_lv2_t lv2_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+invocationIndex,lv2_val[i]); - lv2_val = inclusiveScan2(lv2_val); + lv2_val = reduction2(lv2_val); scratchAccessor.set(invocationIndex, lv2_val[Config::ItemsPerInvocation_2-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -350,7 +350,7 @@ struct reduce for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { scalar_t reduce_val; - scratchAccessor.get(Config::SubgroupSize-1,reduce_val); + scratchAccessor.get(glsl::gl_SubgroupInvocationID(),reduce_val); dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, reduce_val); } } From d2a16634dc52ecd1271d9a39cb6bcbe3ada2056c Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 9 May 2025 14:03:47 +0700 
Subject: [PATCH 083/346] fixes to calculating levels in config --- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 70 +++++++++---------- 1 file changed, 33 insertions(+), 37 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 7ed16faf09..7ea8d6594b 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -23,7 +23,7 @@ namespace impl template struct virtual_wg_size_log2 { - NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2+2),uint16_t,conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2*2+2),uint16_t,3,2>::value,1>::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2),uint16_t,conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2*2+2),uint16_t,3,2>::value,1>::value; NBL_CONSTEXPR_STATIC_INLINE uint16_t value = mpl::max_v+SubgroupSizeLog2; }; @@ -31,7 +31,7 @@ template; - NBL_CONSTEXPR_STATIC_INLINE uint16_t value0 = conditional_value::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t value0 = BaseItemsPerInvocation; NBL_CONSTEXPR_STATIC_INLINE uint16_t value1 = uint16_t(0x1u) << conditional_value, ItemsPerInvocationProductLog2>::value; NBL_CONSTEXPR_STATIC_INLINE uint16_t value2 = uint16_t(0x1u) << mpl::max_v; }; @@ -47,6 +47,7 @@ struct Configuration // must have at least enough level 0 outputs to feed a single subgroup NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroupLog2 = mpl::max_v; + NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroup = 0x1u << SubgroupsPerVirtualWorkgroupLog2; using virtual_wg_t = impl::virtual_wg_size_log2; NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = virtual_wg_t::levels; @@ -67,8 +68,9 @@ struct Configuration<11, 4, ITEMS_PER_INVOC>\ NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << 11u;\ NBL_CONSTEXPR_STATIC_INLINE uint16_t 
SubgroupSizeLog2 = uint16_t(4u);\ NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2;\ - NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroupLog2 = 128u;\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = 3;\ + NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroupLog2 = 7u;\ + NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroup = 128u;\ + NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = 3u;\ NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualWorkgroupSize = uint16_t(0x1u) << 4096;\ NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_0 = ITEMS_PER_INVOC;\ NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_1 = 1u;\ @@ -106,13 +108,10 @@ struct reduce using params_t = subgroup2::ArithmeticParams; subgroup2::reduction reduction; - if (glsl::gl_SubgroupID() == 0) - { - vector_t value; - dataAccessor.get(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); - value = reduction(value); - dataAccessor.set(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); // can be safely merged with top line? - } + vector_t value; + dataAccessor.get(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); + value = reduction(value); + dataAccessor.set(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); // can be safely merged with top line? 
} }; @@ -129,22 +128,19 @@ struct scan using config_t = subgroup2::Configuration; using params_t = subgroup2::ArithmeticParams; - if (glsl::gl_SubgroupID() == 0) + vector_t value; + dataAccessor.get(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); + if (Exclusive) { - vector_t value; - dataAccessor.get(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); - if (Exclusive) - { - subgroup2::exclusive_scan excl_scan; - value = excl_scan(value); - } - else - { - subgroup2::inclusive_scan incl_scan; - value = incl_scan(value); - } - dataAccessor.set(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); // can be safely merged with above lines? + subgroup2::exclusive_scan excl_scan; + value = excl_scan(value); + } + else + { + subgroup2::inclusive_scan incl_scan; + value = incl_scan(value); } + dataAccessor.set(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); // can be safely merged with above lines? 
} }; @@ -176,7 +172,7 @@ struct reduce if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroupLog2 + (virtualSubgroupID/Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); scratchAccessor.set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -189,7 +185,7 @@ struct reduce vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+invocationIndex,lv1_val[i]); + scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); lv1_val = reduction1(lv1_val); scratchAccessor.set(invocationIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } @@ -233,7 +229,7 @@ struct scan if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroupLog2 + (virtualSubgroupID/Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); scratchAccessor.set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -247,7 +243,7 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - 
scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+prevIndex,lv1_val[i]); + scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+prevIndex,lv1_val[i]); vector_lv1_t shiftedInput = hlsl::mix(hlsl::promote(BinOp::identity), lv1_val, bool(invocationIndex)); shiftedInput = inclusiveScan1(shiftedInput); scratchAccessor.set(invocationIndex, shiftedInput[Config::ItemsPerInvocation_1-1]); @@ -309,7 +305,7 @@ struct reduce if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroupLog2 + (virtualSubgroupID/Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); scratchAccessor.set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -322,11 +318,11 @@ struct reduce vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+invocationIndex,lv1_val[i]); + scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); lv1_val = reduction1(lv1_val); if (subgroup::ElectLast()) { - const uint32_t bankedIndex = (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroupLog2 + (invocationIndex/Config::ItemsPerInvocation_2); + const uint32_t bankedIndex = (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (invocationIndex/Config::ItemsPerInvocation_2); scratchAccessor.set(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } @@ -339,7 +335,7 @@ struct reduce vector_lv2_t lv2_val; [unroll] for (uint32_t i = 0; i < 
Config::ItemsPerInvocation_2; i++) - scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+invocationIndex,lv2_val[i]); + scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv2_val[i]); lv2_val = reduction2(lv2_val); scratchAccessor.set(invocationIndex, lv2_val[Config::ItemsPerInvocation_2-1]); } @@ -385,7 +381,7 @@ struct scan if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroupLog2 + (virtualSubgroupID/Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); scratchAccessor.set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -398,11 +394,11 @@ struct scan vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+invocationIndex,lv1_val[i]); + scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); lv1_val = inclusiveScan1(lv1_val); if (subgroup::ElectLast()) { - const uint32_t bankedIndex = (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroupLog2 + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); + const uint32_t bankedIndex = (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); scratchAccessor.set(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } @@ -416,7 +412,7 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - 
scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+prevIndex,lv2_val[i]); + scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+prevIndex,lv2_val[i]); vector_lv2_t shiftedInput = hlsl::mix(hlsl::promote(BinOp::identity), lv2_val, bool(invocationIndex)); shiftedInput = inclusiveScan2(shiftedInput); scratchAccessor.set(invocationIndex, shiftedInput[Config::ItemsPerInvocation_2-1]); From ea39d9e698867a97b0d1f75ff356119d11b12302 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 12 May 2025 16:17:49 +0700 Subject: [PATCH 084/346] fixes to 3-level scan --- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 23 +++++++++++++++---- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 7ea8d6594b..1abd9cccd2 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -58,6 +58,8 @@ struct Configuration NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_1 = items_per_invoc_t::value1; NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_2 = items_per_invoc_t::value2; static_assert(ItemsPerInvocation_1<=4, "3 level scan would have been needed with this config!"); + + NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedMemSize = conditional_value::value + SubgroupsPerVirtualWorkgroup*ItemsPerInvocation_1; }; // special case when workgroup size 2048 and subgroup size 16 needs 3 levels and virtual workgroup size 4096 to get a full subgroup scan each on level 1 and 2 16x16x16=4096 @@ -388,8 +390,9 @@ struct scan scratchAccessor.workgroupExecutionAndMemoryBarrier(); // level 1 scan + const uint32_t lv1_smem_size = Config::SubgroupsPerVirtualWorkgroup*Config::ItemsPerInvocation_1; subgroup2::inclusive_scan inclusiveScan1; - if (glsl::gl_SubgroupID() < Config::SubgroupSizeLog2*Config::ItemsPerInvocation_1) + if (glsl::gl_SubgroupID() < lv1_smem_size) { vector_lv1_t lv1_val; 
[unroll] @@ -398,8 +401,8 @@ struct scan lv1_val = inclusiveScan1(lv1_val); if (subgroup::ElectLast()) { - const uint32_t bankedIndex = (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); - scratchAccessor.set(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + const uint32_t bankedIndex = (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); + scratchAccessor.set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -412,10 +415,20 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+prevIndex,lv2_val[i]); + scratchAccessor.get(lv1_smem_size+i*Config::SubgroupSize+prevIndex,lv2_val[i]); vector_lv2_t shiftedInput = hlsl::mix(hlsl::promote(BinOp::identity), lv2_val, bool(invocationIndex)); shiftedInput = inclusiveScan2(shiftedInput); - scratchAccessor.set(invocationIndex, shiftedInput[Config::ItemsPerInvocation_2-1]); + + // combine with level 1, only last element of each + [unroll] + for (uint32_t i = 0; i < Config::SubgroupsPerVirtualWorkgroup; i++) + { + scalar_t last_val; + scratchAccessor.get((Config::ItemsPerInvocation_1-1)*Config::SubgroupsPerVirtualWorkgroup+(Config::SubgroupsPerVirtualWorkgroup-1-i),last_val); + scalar_t val = hlsl::mix(hlsl::promote(BinOp::identity), lv2_val, bool(i)); + val = binop(last_val, shiftedInput[Config::ItemsPerInvocation_2-1]); + scratchAccessor.set((Config::ItemsPerInvocation_1-1)*Config::SubgroupsPerVirtualWorkgroup+(Config::SubgroupsPerVirtualWorkgroup-1-i), last_val); + } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); From f356185c87a5ce8ddf8deeeba4376f92d90aa3dd Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 13 May 
2025 09:54:28 +0200 Subject: [PATCH 085/346] Make the staging cache reference counted and make failures propagate properly --- include/nbl/video/utilities/CAssetConverter.h | 9 +- src/nbl/video/utilities/CAssetConverter.cpp | 357 +++++++++--------- 2 files changed, 182 insertions(+), 184 deletions(-) diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index d9ace6226e..01da012a0d 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -973,7 +973,14 @@ class CAssetConverter : public core::IReferenceCounted public: template - using staging_cache_t = core::unordered_map::video_t*,typename CCache::key_t>; + struct staging_cache_key + { + core::smart_refctd_ptr::video_t> gpuRef; + typename CCache::key_t cacheKey; + }; + // it may seem weird storing both a smart pointer and a raw pointer, but the reason is to be able to drop a refcount while not loosing the key for lookup + template + using staging_cache_t = core::unordered_map::video_t*,staging_cache_key>; inline SReserveResult(SReserveResult&&) = default; inline SReserveResult(const SReserveResult&) = delete; diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 7bfd361e94..de72e2f360 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2496,7 +2496,7 @@ struct conversions_t return; } // insert into staging cache - stagingCache.emplace(gpuObj.get(),typename CAssetConverter::CCache::key_t(contentHash,uniqueCopyGroupID)); + stagingCache.emplace(gpuObj.get(),CAssetConverter::SReserveResult::staging_cache_key{gpuObj.value,typename CAssetConverter::CCache::key_t(contentHash,uniqueCopyGroupID)}); // propagate back to dfsCache created.gpuObj = std::move(gpuObj); } @@ -3534,12 +3534,14 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult if (const auto& gpuObj=found.gpuObj; gpuObj) { 
results[i] = gpuObj; +#ifdef _NBL_DEBUG // if something with this content hash is in the stagingCache, then it must match the `found->gpuObj` if (auto finalCacheIt=stagingCache.find(gpuObj.get()); finalCacheIt!=stagingCache.end()) { - const bool matches = finalCacheIt->second==typename CCache::key_t(found.contentHash,uniqueCopyGroupID); + const bool matches = finalCacheIt->second.cacheKey==typename CCache::key_t(found.contentHash,uniqueCopyGroupID); assert(matches); } +#endif } else inputs.logger.log("No GPU Object could be found or created for Root Asset %p in group %d",system::ILogger::ELL_ERROR,asset,uniqueCopyGroupID); @@ -3557,16 +3559,18 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { if (entry.first->getReferenceCount()==1) { + // I know what I'm doing, the hashmap is being annoying not letting you look up with const pointer key a non const pointer hashmap + auto* gpuObj = const_cast::video_t*>(entry.first); if constexpr (std::is_same_v) - retval.m_bufferConversions.erase(entry.first); + retval.m_bufferConversions.erase(gpuObj); if constexpr (std::is_same_v) for (auto i=0; i<2; i++) - retval.m_blasConversions[i].erase(entry.first); + retval.m_blasConversions[i].erase(gpuObj); if constexpr (std::is_same_v) for (auto i=0; i<2; i++) - retval.m_tlasConversions[i].erase(entry.first); + retval.m_tlasConversions[i].erase(gpuObj); if constexpr (std::is_same_v) - retval.m_imageConversions.erase(entry.first); + retval.m_imageConversions.erase(gpuObj); return true; } // still referenced, keep it around @@ -3706,16 +3710,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } }; - // - auto findInStaging = [&reservations](const typename asset_traits::video_t* gpuObj)->core::blake3_hash_t* - { - auto& stagingCache = std::get>(reservations.m_stagingCaches); - const auto found = stagingCache.find(const_cast::video_t*>(gpuObj)); - assert(found!=stagingCache.end()); - return const_cast(&found->second.value); - }; // wipe gpu item 
in staging cache (this may drop it as well if it was made for only a root asset == no users) - core::unordered_map outputReverseMap; + core::unordered_map outputReverseMap; core::for_each_in_tuple(reservations.m_gpuObjects,[&outputReverseMap](const auto& gpuObjects)->void { uint32_t i = 0; @@ -3723,21 +3719,21 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul outputReverseMap[gpuObj.value.get()] = i++; } ); - auto markFailureInStaging = [&reservations,&outputReverseMap,logger](const char* message, smart_refctd_ptr& canonical, const typename asset_traits::video_t* gpuObj, core::blake3_hash_t* hash)->void + auto markFailure = [&reservations,&outputReverseMap,logger](const char* message, smart_refctd_ptr* canonical, typename SReserveResult::staging_cache_t::mapped_type* cacheNode)->void { // wipe the smart pointer to the canonical, make sure we release that memory ASAP if no other user is around - canonical = nullptr; - logger.log("%s failed for \"%s\"",system::ILogger::ELL_ERROR,message,gpuObj->getObjectDebugName()); - // change the content hash on the reverse map to a NoContentHash - *hash = CHashCache::NoContentHash; + *canonical = nullptr; // also drop the smart pointer from the output array so failures release memory quickly - const auto foundIx = outputReverseMap.find(gpuObj); + const auto foundIx = outputReverseMap.find(cacheNode->gpuRef.get()); if (foundIx!=outputReverseMap.end()) { auto& resultOutput = std::get>(reservations.m_gpuObjects); resultOutput[foundIx->second].value = nullptr; outputReverseMap.erase(foundIx); } + logger.log("%s failed for \"%s\"",system::ILogger::ELL_ERROR,message,cacheNode->gpuRef->getObjectDebugName()); + // drop smart pointer + cacheNode->gpuRef = nullptr; }; // want to check if deps successfully exist @@ -3751,10 +3747,10 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul }; auto missingDependent = [&reservations](const typename asset_traits::video_t* dep)->SMissingDependent { - auto& 
stagingCache = std::get>(reservations.m_stagingCaches); - auto found = stagingCache.find(const_cast::video_t*>(dep)); + const auto& stagingCache = std::get>(reservations.m_stagingCaches); + const auto found = stagingCache.find(dep); SMissingDependent retval = {.wasInStaging=found!=stagingCache.end()}; - retval.gotWiped = retval.wasInStaging && found->second.value==CHashCache::NoContentHash; + retval.gotWiped = retval.wasInStaging && !found->second.gpuRef; return retval; }; @@ -3975,6 +3971,15 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // some state so we don't need to look later auto xferCmdBuf = shouldDoSomeTransfer ? params.transfer->getCommandBufferForRecording():nullptr; + // + auto findInStaging = [&reservations](const typename asset_traits::video_t* gpuObj)->auto + { + auto& stagingCache = std::get>(reservations.m_stagingCaches); + const auto found = stagingCache.find(gpuObj); + assert(found!=stagingCache.end()); + return found; + }; + using buffer_mem_barrier_t = IGPUCommandBuffer::SBufferMemoryBarrier; // upload Buffers auto& buffersToUpload = reservations.m_bufferConversions; @@ -3994,12 +3999,12 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // host will upload if (canHostWriteToMemoryRange(buffer->getBoundMemory(),size)) continue; - auto pFoundHash = findInStaging.template operator()(buffer); + auto pFound = &findInStaging.template operator()(buffer)->second; // - const auto ownerQueueFamily = checkOwnership(buffer,params.getFinalOwnerQueueFamily(buffer,*pFoundHash),transferFamily); + const auto ownerQueueFamily = checkOwnership(buffer,params.getFinalOwnerQueueFamily(buffer,pFound->cacheKey.value),transferFamily); if (ownerQueueFamily==QueueFamilyInvalid) { - markFailureInStaging("invalid Final Queue Family given by user callback",item.second,buffer,pFoundHash); + markFailure("invalid Final Queue Family given by user callback",&item.second,pFound); continue; } // do the upload @@ -4009,7 +4014,7 @@ 
ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul xferCmdBuf = params.transfer->getCommandBufferForRecording(); if (!success) { - markFailureInStaging("Data Upload",item.second,buffer,pFoundHash); + markFailure("Data Upload",&item.second,pFound); continue; } // let go of canonical asset (may free RAM) @@ -4175,7 +4180,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // basiscs auto& cpuImg = item.second.canonical; auto* image = item.first; - auto pFoundHash = findInStaging.template operator()(image); + auto pFound = &findInStaging.template operator()(image)->second; // get params const auto& creationParams = image->getCreationParameters(); const auto format = creationParams.format; @@ -4225,7 +4230,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } if (!quickWriteDescriptor(SrcMipBinding,srcIx,std::move(srcView))) { - markFailureInStaging("Source Mip Level Descriptor Write",cpuImg,image,pFoundHash); + markFailure("Source Mip Level Descriptor Write",&cpuImg,pFound); continue; } } @@ -4246,7 +4251,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } else { - markFailureInStaging("Image QFOT Pipeline Barrier",cpuImg,image,pFoundHash); + markFailure("Image QFOT Pipeline Barrier",&cpuImg,pFound); return false; } return true; @@ -4295,7 +4300,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // if we're recomputing this mip level const bool recomputeMip = lvl && (recomputeMipMask&(0x1u<<(lvl-1))); // query final layout from callback - const auto finalLayout = params.getFinalLayout(image,*pFoundHash,lvl); + const auto finalLayout = params.getFinalLayout(image,pFound->cacheKey.value,lvl); // get region data for upload auto regions = cpuImg->getRegions(lvl); // basic error checks @@ -4306,7 +4311,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul logger.log("What are you doing requesting layout UNDEFINED for mip level % of image %s after Upload or Mip 
Recomputation!?",system::ILogger::ELL_ERROR,lvl,image->getObjectDebugName()); break; } - const auto suggestedFinalOwner = params.getFinalOwnerQueueFamily(image,*pFoundHash,lvl); + const auto suggestedFinalOwner = params.getFinalOwnerQueueFamily(image,pFound->cacheKey.value,lvl); // if we'll recompute the mipmap, then do the layout transition on the compute queue (there's one less potential QFOT) if (recomputeMip) { @@ -4561,7 +4566,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // failed in the for-loop if (lvl != creationParams.mipLevels) { - markFailureInStaging("Compute Mip Mapping",cpuImg,image,pFoundHash); + markFailure("Compute Mip Mapping",&cpuImg,pFound); continue; } // let go of canonical asset (may free RAM) @@ -4572,7 +4577,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul { if (!pipelineBarrier(xferCmdBuf,{.memBarriers={},.bufBarriers={},.imgBarriers=transferBarriers},"Final Pipeline Barrier recording to Transfer Command Buffer failed")) { - markFailureInStaging("Image Data Upload Pipeline Barrier",cpuImg,image,pFoundHash); + markFailure("Image Data Upload Pipeline Barrier",&cpuImg,pFound); continue; } // even if no uploads performed, we do layout transitions on empty images from Xfer Queue @@ -4584,7 +4589,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul dsAlloc->multi_deallocate(SrcMipBinding,1,&srcIx,params.compute->getFutureScratchSemaphore()); if (!pipelineBarrier(computeCmdBuf,{.memBarriers={},.bufBarriers={},.imgBarriers=computeBarriers},"Final Pipeline Barrier recording to Compute Command Buffer failed")) { - markFailureInStaging("Compute Mip Mapping Pipeline Barrier",cpuImg,image,pFoundHash); + markFailure("Compute Mip Mapping Pipeline Barrier",&cpuImg,pFound); continue; } } @@ -4659,9 +4664,9 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul if (!success) for (const auto& info : buildInfos) { - const auto pFoundHash = findInStaging.template 
operator()(info.dstAS); + const auto stagingFound = findInStaging.template operator()(info.dstAS); smart_refctd_ptr dummy; // already null at this point - markFailureInStaging("AS Build Command Recording",dummy,info.dstAS,pFoundHash); + markFailure("AS Build Command Recording",&dummy,&stagingFound->second); } buildInfos.clear(); rangeInfos.clear(); @@ -4710,14 +4715,14 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul for (auto& item : blasToBuild) { auto* as = item.gpuObj; - auto pFoundHash = findInStaging.template operator()(as); + auto pFound = &findInStaging.template operator()(as)->second; if (item.asBuildParams.host) { auto dOp = device->createDeferredOperation(); // if (!device->buildAccelerationStructure(dOp.get(),info,range)) { - markFailureInStaging("BLAS Build Command Recording",item.canonical,gpuObj,pFoundHash); + markFailure("BLAS Build Command Recording",&item.canonical,pFound); continue; } } @@ -4811,13 +4816,13 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul { auto& canonical = tlasToBuild.second.canonical; const auto as = tlasToBuild.first; - const auto pFoundHash = findInStaging.template operator()(as); + const auto pFound = &findInStaging.template operator()(as)->second; const auto& backingRange = as->getCreationParams().bufferRange; // checking ownership for the future on old buffer, but compacted will be made with same sharing creation parameters - const auto finalOwnerQueueFamily = checkOwnership(backingRange.buffer.get(),params.getFinalOwnerQueueFamily(as,*pFoundHash),computeFamily); + const auto finalOwnerQueueFamily = checkOwnership(backingRange.buffer.get(),params.getFinalOwnerQueueFamily(as,pFound->cacheKey.value),computeFamily); if (finalOwnerQueueFamily==QueueFamilyInvalid) { - markFailureInStaging("invalid Final Queue Family given by user callback",canonical,as,pFoundHash); + markFailure("invalid Final Queue Family given by user callback",&canonical,pFound); continue; } const auto instances = 
canonical->getInstances(); @@ -4843,13 +4848,13 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // problem with building some Dependent BLASes if (failedBLASBarrier && dependsOnBLASBuilds) { - markFailureInStaging("building BLASes which current TLAS build wants to instance",canonical,as,pFoundHash); + markFailure("building BLASes which current TLAS build wants to instance",&canonical,pFound); continue; } // problem with finding the dependents (BLASes) if (instanceDataSize==0) { - markFailureInStaging("finding valid Dependant GPU BLASes for TLAS build",canonical,as,pFoundHash); + markFailure("finding valid Dependant GPU BLASes for TLAS build",&canonical,pFound); continue; } // allocate scratch and build inputs @@ -4954,7 +4959,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul if (!success) { trackedBLASes.resize(trackedBLASesOffset); - markFailureInStaging("Uploading Instance Data for TLAS build failed",canonical,as,pFoundHash); + markFailure("Uploading Instance Data for TLAS build failed",&canonical,pFound); continue; } // let go of canonical asset (may free RAM) @@ -5165,159 +5170,145 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // finish host tasks if not done yet hostUploadBuffers([]()->bool{return true;}); + // in the future we'll also finish host image copies - // insert items into cache if overflows handled fine and commandbuffers ready to be recorded - auto mergeCache = [&]()->void + // check dependents before inserting into cache + if (reqQueueFlags.value!=IQueue::FAMILY_FLAGS::NONE) { - auto& stagingCache = std::get>(reservations.m_stagingCaches); - auto& cache = std::get>(m_caches); - cache.m_forwardMap.reserve(cache.m_forwardMap.size()+stagingCache.size()); - cache.m_reverseMap.reserve(cache.m_reverseMap.size()+stagingCache.size()); - constexpr bool IsTLAS = std::is_same_v; - for (auto& item : stagingCache) - if (item.second.value!=CHashCache::NoContentHash) // didn't get wiped + auto 
checkDependents = [&]()->void { - // rescan all the GPU objects and find out if they depend on anything that failed, if so add to failure set - bool depsMissing = false; - // only go over types we could actually break via missing upload/build (i.e. pipelines are unbreakable) - if constexpr (IsTLAS) - { - // A built TLAS cannot be queried about the BLASes it contains, so just trust the pre-TLAS-build input validation did its job - } - - if constexpr (std::is_same_v) - depsMissing = missingDependent.template operator()(item.first->getUnderlyingBuffer()); - if constexpr (std::is_same_v) - depsMissing = missingDependent.template operator()(item.first->getCreationParameters().image.get()); - if constexpr (std::is_same_v) - { - const IGPUDescriptorSetLayout* layout = item.first->getLayout(); - // check samplers - { - const auto count = layout->getTotalMutableCombinedSamplerCount(); - const auto* samplers = item.first->getAllMutableCombinedSamplers(); - for (auto i=0u; !depsMissing && i(samplers[i].get()); - } - for (auto i=0u; !depsMissing && i(asset::IDescriptor::E_TYPE::ET_COUNT); i++) + auto& stagingCache = std::get>(reservations.m_stagingCaches); + phmap::erase_if(stagingCache,[&](auto& item)->bool { - const auto type = static_cast(i); - const auto count = layout->getTotalDescriptorCount(type); - auto* psDescriptors = item.first->getAllDescriptors(type); - if (!psDescriptors) - continue; - for (auto i=0u; !depsMissing && i) + depsMissing = missingDependent.template operator()(pGpuObj->getUnderlyingBuffer()); + if constexpr (std::is_same_v) + depsMissing = missingDependent.template operator()(pGpuObj->getCreationParameters().image.get()); + if constexpr (std::is_same_v) { - auto* untypedDesc = psDescriptors[i].get(); - if (untypedDesc) - switch (asset::IDescriptor::GetTypeCategory(type)) + const IGPUDescriptorSetLayout* layout = pGpuObj->getLayout(); + // check samplers { - case asset::IDescriptor::EC_BUFFER: - depsMissing = missingDependent.template 
operator()(static_cast(untypedDesc)); - break; - case asset::IDescriptor::EC_SAMPLER: - depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); - break; - case asset::IDescriptor::EC_IMAGE: - depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); - break; - case asset::IDescriptor::EC_BUFFER_VIEW: - depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); - break; - case asset::IDescriptor::EC_ACCELERATION_STRUCTURE: - depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); - break; - default: - assert(false); - depsMissing = true; - break; + const auto count = layout->getTotalMutableCombinedSamplerCount(); + const auto* samplers = pGpuObj->getAllMutableCombinedSamplers(); + for (auto i=0u; !depsMissing && i(samplers[i].get()); + } + for (auto i=0u; !depsMissing && i(asset::IDescriptor::E_TYPE::ET_COUNT); i++) + { + const auto type = static_cast(i); + const auto count = layout->getTotalDescriptorCount(type); + auto* psDescriptors = pGpuObj->getAllDescriptors(type); + if (!psDescriptors) + continue; + for (auto i=0u; !depsMissing && i(static_cast(untypedDesc)); + break; + case asset::IDescriptor::EC_SAMPLER: + depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); + break; + case asset::IDescriptor::EC_IMAGE: + depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); + break; + case asset::IDescriptor::EC_BUFFER_VIEW: + depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); + break; + case asset::IDescriptor::EC_ACCELERATION_STRUCTURE: + depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); + break; + default: + assert(false); + depsMissing = true; + break; + } + } } } + if (depsMissing) + { + smart_refctd_ptr dummy; + // I know what I'm doing (breaking the promise of the `erase_if` to not mutate the inputs) + markFailure("because conversion of a dependant failed!",&dummy,&item.second); + } 
+ return depsMissing; } - } - auto* pGpuObj = item.first; - if (depsMissing) - { - logger.log("GPU Obj %s not writing to final cache because conversion of a dependant failed!",system::ILogger::ELL_ERROR,pGpuObj->getObjectDebugName()); - // wipe self, to let users know - item.second.value = {}; - continue; - } - // The BLASes don't need to do this, because no-one checks for them as dependents and we can substitute the `item.first` in the staging cache right away - // For TLASes we need to write the compacted TLAS and not the intermediate build to the Cache - if constexpr (IsTLAS) + ); + }; + // Bottom up, only go over types we could actually break via missing upload/build (i.e. pipelines are unbreakable) + // A built TLAS cannot be queried about the BLASes it contains, so just trust the pre-TLAS-build input validation did its job + checkDependents.template operator()(); + checkDependents.template operator()(); + checkDependents.template operator()(); +// mergeCache.template operator()(); + // overwrite the compacted TLASes in Descriptor Sets + if (auto& tlasRewriteSet=reservations.m_potentialTLASRewrites; !tlasRewriteSet.empty()) + { + core::vector writes; + writes.reserve(tlasRewriteSet.size()); + core::vector infos(tlasRewriteSet.size()); + auto* pInfo = infos.data(); + for (auto& entry : tlasRewriteSet) { - auto found = compactedTLASMap.find(pGpuObj); - if (found!=compactedTLASMap.end()) - pGpuObj = found->second.get(); - + auto* const dstSet = entry.dstSet; + // we need to check if the descriptor set itself didn't get deleted in the meantime + if (missingDependent.template operator()(dstSet)) + continue; + // rewtrieve the binding from the TLAS + const auto* const tlas = static_cast(dstSet->getAllDescriptors(IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE)[entry.storageOffset.data].get()); + assert(tlas); + // only rewrite if successfully compacted + if (const auto foundCompacted=compactedTLASMap.find(tlas); foundCompacted!=compactedTLASMap.end()) + { + 
pInfo->desc = foundCompacted->second; + using redirect_t = IDescriptorSetLayoutBase::CBindingRedirect; + const redirect_t& redirect = dstSet->getLayout()->getDescriptorRedirect(IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE); + const auto bindingRange = redirect.findBindingStorageIndex(entry.storageOffset); + const auto firstElementOffset = redirect.getStorageOffset(bindingRange); + writes.push_back({ + .dstSet = dstSet, + .binding = redirect.getBinding(bindingRange).data, + .arrayElement = entry.storageOffset.data-firstElementOffset.data, + .count = 1, + .info = pInfo++ + }); + } } - // We have success now, but ask callback if we write to the new cache. - if (!params.writeCache(item.second)) // TODO: let the user know the pointer to the GPU Object too? - continue; - asset_cached_t cached; - cached.value = core::smart_refctd_ptr::video_t>(pGpuObj); - cache.m_reverseMap.emplace(pGpuObj,item.second); - cache.m_forwardMap.emplace(item.second,std::move(cached)); + // if the descriptor write fails, we make the Descriptor Sets behave as-if the TLAS build failed (dep is missing) + if (!writes.empty() && !device->updateDescriptorSets(writes,{})) + logger.log("Failed to write one of the compacted TLASes into a Descriptor Set, all Descriptor Sets will still use non-compacted TLASes",system::ILogger::ELL_ERROR); } - }; - // again, need to go bottom up so we can check dependencies being successes - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - // overwrite the compacted TLASes in Descriptor Sets - if (auto& 
tlasRewriteSet=reservations.m_potentialTLASRewrites; !tlasRewriteSet.empty()) + } + + // insert items into cache if overflows handled fine and commandbuffers ready to be recorded + core::for_each_in_tuple(reservations.m_stagingCaches,[&](SReserveResult::staging_cache_t& stagingCache)->void { - core::vector writes; - writes.reserve(tlasRewriteSet.size()); - core::vector infos(tlasRewriteSet.size()); - auto* pInfo = infos.data(); - for (auto& entry : tlasRewriteSet) + auto& cache = std::get>(m_caches); + cache.m_forwardMap.reserve(cache.m_forwardMap.size()+stagingCache.size()); + cache.m_reverseMap.reserve(cache.m_reverseMap.size()+stagingCache.size()); + for (auto& item : stagingCache) + if (item.second.gpuRef) // not wiped { - auto* const dstSet = entry.dstSet; - // we need to check if the descriptor set itself didn't get deleted in the meantime - auto& stagingCache = std::get>(reservations.m_stagingCaches); - const auto found = stagingCache.find(dstSet); - if (found==stagingCache.end()) + // We have success now, but ask callback if we write to the new cache. + if (!params.writeCache(item.second.cacheKey)) // TODO: let the user know the pointer to the GPU Object too? 
continue; - // rewtrieve the binding from the TLAS - const auto* const tlas = static_cast(dstSet->getAllDescriptors(IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE)[entry.storageOffset.data].get()); - assert(tlas); - // only rewrite if successfully compacted - if (const auto foundCompacted=compactedTLASMap.find(tlas); foundCompacted!=compactedTLASMap.end()) - { - pInfo->desc = foundCompacted->second; - using redirect_t = IDescriptorSetLayoutBase::CBindingRedirect; - const redirect_t& redirect = dstSet->getLayout()->getDescriptorRedirect(IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE); - const auto bindingRange = redirect.findBindingStorageIndex(entry.storageOffset); - const auto firstElementOffset = redirect.getStorageOffset(bindingRange); - writes.push_back({ - .dstSet = dstSet, - .binding = redirect.getBinding(bindingRange).data, - .arrayElement = entry.storageOffset.data-firstElementOffset.data, - .count = 1, - .info = pInfo++ - }); - } + asset_cached_t cached; + cached.value = std::move(item.second.gpuRef); + cache.m_reverseMap.emplace(item.first,item.second.cacheKey); + cache.m_forwardMap.emplace(item.second.cacheKey,std::move(cached)); } - // if the descriptor write fails, we make the Descriptor Sets behave as-if the TLAS build failed (dep is missing) - if (!writes.empty() && !device->updateDescriptorSets(writes,{})) - logger.log("Failed to write one of the compacted TLASes into a Descriptor Set, all Descriptor Sets will still use non-compacted TLASes",system::ILogger::ELL_ERROR); - } - mergeCache.template operator()(); -// mergeCache.template operator()(); + // provoke refcounting bugs ASAP + stagingCache.clear(); + }); // no submit was necessary, so should signal the extra semaphores from the host if (!retval.blocking()) From 0b791b545b40734c17c240dca92837ebcf8cb5c5 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 13 May 2025 15:26:59 +0700 Subject: [PATCH 086/346] Fix discardDependantsContents and anyDependantDiscardedContents to use computeDependants --- 
include/nbl/asset/IPreHashed.h | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/include/nbl/asset/IPreHashed.h b/include/nbl/asset/IPreHashed.h index 4bc5ca5dcd..4ffda209df 100644 --- a/include/nbl/asset/IPreHashed.h +++ b/include/nbl/asset/IPreHashed.h @@ -43,28 +43,28 @@ class IPreHashed : public IAsset { struct stack_entry_t { - IAsset* asset; - size_t childCount = 0; - size_t childrenVisited = 0; + const IAsset* asset; + core::unordered_set unvisitedChilds; }; core::stack stack; core::unordered_set alreadyVisited; - auto push = [&stack,&alreadyVisited](IAsset* node) -> void + auto push = [&stack,&alreadyVisited](const IAsset* node) -> void { if (!node) return; const auto [dummy,inserted] = alreadyVisited.insert(node); if (inserted) - stack.push({.asset=node,.childCount=node->getDependantCount()}); + stack.push({ .asset = node, .unvisitedChilds = node->computeDependants()}); }; for (const auto& root : roots) push(root); while (!stack.empty()) { auto& entry = stack.top(); - if (entry.childrenVisited 0) { - const auto dep = entry.asset->getDependant(entry.childrenVisited++); + auto dep = *entry.unvisitedChilds.begin(); + entry.unvisitedChilds.erase(entry.unvisitedChilds.begin()); push(dep); } else @@ -82,8 +82,7 @@ class IPreHashed : public IAsset struct stack_entry_t { const IAsset* asset; - size_t childCount = 0; - size_t childrenVisited = 0; + core::unordered_set unvisitedChilds; }; core::stack stack; core::unordered_set alreadyVisited; @@ -97,7 +96,7 @@ class IPreHashed : public IAsset auto* isPrehashed = dynamic_cast(node); if (isPrehashed && isPrehashed->missingContent()) return true; - stack.push({.asset=node,.childCount=node->getDependantCount()}); + stack.push({ .asset = node, .unvisitedChilds = node->computeDependants() }); } return false; }; @@ -106,9 +105,11 @@ class IPreHashed : public IAsset while (!stack.empty()) { auto& entry = stack.top(); - if (entry.childrenVisited 0) { - const auto dep = 
entry.asset->getDependant(entry.childrenVisited++); + auto dep = *unvisitedChilds.begin(); + unvisitedChilds.erase(unvisitedChilds.begin()); if (push(dep)) return true; } From e8e43b1fe68f981f8b583941e0b90c359f51fbde Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 13 May 2025 15:30:56 +0700 Subject: [PATCH 087/346] Add Ray Tracing Pipeline Asset to IAsset --- include/nbl/asset/IAsset.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/nbl/asset/IAsset.h b/include/nbl/asset/IAsset.h index 3802536029..a1689daa63 100644 --- a/include/nbl/asset/IAsset.h +++ b/include/nbl/asset/IAsset.h @@ -94,6 +94,7 @@ class IAsset : virtual public core::IReferenceCounted ET_COMPUTE_PIPELINE = 1ull<<20, //!< asset::ICPUComputePipeline ET_PIPELINE_CACHE = 1ull<<21, //!< asset::ICPUPipelineCache ET_SCENE = 1ull<<22, //!< reserved, to implement later + ET_RAYTRACING_PIPELINE = 1ull << 23, //!< asset::ICPURayTracingPipeline ET_IMPLEMENTATION_SPECIFIC_METADATA = 1ull<<31u, //!< lights, etc. //! Reserved special value used for things like terminating lists of this enum From b9db6aa2e1b8a2297c621daab047b757e3b47c36 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 13 May 2025 15:32:52 +0700 Subject: [PATCH 088/346] Remove unnecessary specInfo assignment in clone method --- include/nbl/asset/ICPUGraphicsPipeline.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index 62b25443cc..e376300121 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -29,8 +29,6 @@ class ICPUGraphicsPipeline final : public ICPUPipeline clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final { auto* newPipeline = new ICPUGraphicsPipeline(layout.get()); - for (auto i = 0; i < GRAPHICS_SHADER_STAGE_COUNT; i++) - newPipeline->m_specInfos[i] = m_specInfos[i]; newPipeline->m_params = m_params; newPipeline->m_renderpass = m_renderpass; From 
2ae6f7818428562f73ead04408a3ffa55e32066a Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 13 May 2025 15:40:42 +0700 Subject: [PATCH 089/346] Move subgroup argument to computePipelineBase --- include/nbl/asset/ICPUPipeline.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index 8b90458f21..ae2c64372d 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -69,8 +69,6 @@ class ICPUPipelineBase core::smart_refctd_ptr shader = nullptr; std::string entryPoint = ""; - IPipelineBase::SUBGROUP_SIZE requiredSubgroupSize : 3 = IPipelineBase::SUBGROUP_SIZE::UNKNOWN; //!< Default value of 8 means no requirement - uint8_t requireFullSubgroups : 1 = false; // Container choice implicitly satisfies: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-constantID-04911 From 8de6d9a5992b8ff227a9e24cd9e0026ba1e49b80 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 13 May 2025 15:41:08 +0700 Subject: [PATCH 090/346] Remove getDependantCount and getDependant and getDependant_impl from IAsset --- include/nbl/asset/IAsset.h | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/include/nbl/asset/IAsset.h b/include/nbl/asset/IAsset.h index a1689daa63..c3950c4912 100644 --- a/include/nbl/asset/IAsset.h +++ b/include/nbl/asset/IAsset.h @@ -156,20 +156,6 @@ class IAsset : virtual public core::IReferenceCounted //! inline bool isMutable() const {return m_mutable;} - //! - virtual size_t getDependantCount() const = 0; - inline IAsset* getDependant(const size_t ix) - { - if (ix(this)->getDependant(ix); - return retval; - } - virtual core::unordered_set computeDependants() const = 0; virtual bool valid() const = 0; @@ -179,8 +165,6 @@ class IAsset : virtual public core::IReferenceCounted //! 
Pure virtual destructor to ensure no instantiation NBL_API2 virtual ~IAsset() = 0; - virtual IAsset* getDependant_impl(const size_t ix) = 0; - private: friend IAssetManager; bool m_mutable = true; From 3f6599267befa369bc171f21dac3af67d06f7a0d Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 13 May 2025 15:41:41 +0700 Subject: [PATCH 091/346] Implement computeDependants for ICPUGraphicsPIpeline --- include/nbl/asset/ICPUGraphicsPipeline.h | 26 ++++-------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index e376300121..0629f82f1c 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -43,15 +43,12 @@ class ICPUGraphicsPipeline final : public ICPUPipeline computeDependants() const override { - auto stageCount = 2; // the layout and renderpass + core::unordered_set dependants = { m_layout.get(), m_renderpass.get()}; for (const auto& info : m_specInfos) - { - if (info.shader) - stageCount++; - } - return stageCount; + if (info.shader) dependants.insert(info.shader.get()); + return dependants; } inline SCachedCreationParams& getCachedCreationParams() @@ -90,21 +87,6 @@ class ICPUGraphicsPipeline final : public ICPUPipeline(m_layout.get()); - if (ix==1) - return m_renderpass.get(); - size_t stageCount = 0; - for (auto& specInfo : m_specInfos) - { - if (specInfo.shader) - if ((stageCount++)==ix-2) return specInfo.shader.get(); - } - return nullptr; - } - std::array m_specInfos; private: From 89b8daaaf6618b4da8472629b3408ff55f85539e Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 13 May 2025 15:42:03 +0700 Subject: [PATCH 092/346] Implement computeDependants for ICPURayTracingPIpeline --- include/nbl/asset/ICPURayTracingPipeline.h | 88 ++++++++++++---------- 1 file changed, 47 insertions(+), 41 deletions(-) diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h index 
23a1d82225..5be344d1f2 100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -19,22 +19,10 @@ class ICPURayTracingPipeline final : public ICPUPipeline; public: - struct SHitGroupSpecInfo { - SShaderSpecInfo closestHit; - SShaderSpecInfo anyHit; - SShaderSpecInfo intersection; - - SHitGroupSpecInfo clone(uint32_t depth) const - { - auto newSpecInfo = *this; - if (depth > 0u) - { - newSpecInfo.closestHit.shader = core::smart_refctd_ptr_static_cast(this->closestHit.shader->clone(depth - 1u)); - newSpecInfo.anyHit.shader = core::smart_refctd_ptr_static_cast(this->anyHit.shader->clone(depth - 1u)); - newSpecInfo.intersection.shader = core::smart_refctd_ptr_static_cast(this->intersection.shader->clone(depth - 1u)); - } - return newSpecInfo; - } + struct SHitGroupSpecInfos { + core::vector closestHits; + core::vector anyHits; + core::vector intersections; }; static core::smart_refctd_ptr create(const ICPUPipelineLayout* layout) @@ -48,23 +36,18 @@ class ICPURayTracingPipeline final : public ICPUPipelinem_raygen = m_raygen.clone(depth); - newPipeline->m_misses.resize(m_misses.size()); - for (auto specInfo_i = 0u; specInfo_i < m_misses.size(); specInfo_i++) - { - newPipeline->m_misses[specInfo_i] = m_misses[specInfo_i].clone(depth); - } - - newPipeline->m_hitGroups.resize(m_hitGroups.size()); - for (auto specInfo_i = 0u; specInfo_i < m_misses.size(); specInfo_i++) - { - newPipeline->m_hitGroups[specInfo_i] = m_hitGroups[specInfo_i].clone(depth); - } - - newPipeline->m_callables.resize(m_callables.size()); - for (auto specInfo_i = 0u; specInfo_i < m_callables.size(); specInfo_i++) - { - newPipeline->m_callables[specInfo_i] = m_callables[specInfo_i].clone(depth); - } + auto cloneSpecInfos = [depth](const core::vector& specInfos) -> core::vector { + core::vector results; + results.resize(specInfos.size()); + for (auto specInfo_i = 0u; specInfo_i < specInfos.size(); specInfo_i++) + results[specInfo_i] = 
specInfos[specInfo_i].clone(depth); + return results; + }; + newPipeline->m_misses = cloneSpecInfos(m_misses); + newPipeline->m_hitGroups.anyHits = cloneSpecInfos(m_hitGroups.anyHits); + newPipeline->m_hitGroups.closestHits = cloneSpecInfos(m_hitGroups.closestHits); + newPipeline->m_hitGroups.intersections = cloneSpecInfos(m_hitGroups.intersections); + newPipeline->m_callables = cloneSpecInfos(m_callables); newPipeline->m_params = m_params; return core::smart_refctd_ptr(newPipeline); @@ -75,17 +58,39 @@ class ICPURayTracingPipeline final : public ICPUPipeline computeDependants() const override final { + core::unordered_set dependants; + dependants.insert(m_raygen.shader.get()); + for (const auto& missInfo : m_misses) dependants.insert(missInfo.shader.get()); + for (const auto& anyHitInfo : m_hitGroups.anyHits) dependants.insert(anyHitInfo.shader.get()); + for (const auto& closestHitInfo : m_hitGroups.closestHits) dependants.insert(closestHitInfo.shader.get()); + for (const auto& intersectionInfo : m_hitGroups.intersections) dependants.insert(intersectionInfo.shader.get()); + for (const auto& callableInfo : m_callables) dependants.insert(callableInfo.shader.get()); + return dependants; + } + inline virtual std::span getSpecInfo(hlsl::ShaderStage stage) const override final { - switch (stage) - { - case hlsl::ShaderStage::ESS_RAYGEN: - return { &m_raygen, 1 }; - } + switch (stage) + { + case hlsl::ShaderStage::ESS_RAYGEN: + return { &m_raygen, 1 }; + case hlsl::ShaderStage::ESS_MISS: + return m_misses; + case hlsl::ShaderStage::ESS_ANY_HIT: + return m_hitGroups.anyHits; + case hlsl::ShaderStage::ESS_CLOSEST_HIT: + return m_hitGroups.closestHits; + case hlsl::ShaderStage::ESS_INTERSECTION: + return m_hitGroups.intersections; + case hlsl::ShaderStage::ESS_CALLABLE: + return m_callables; + + } return {}; } @@ -100,7 +105,8 @@ class ICPURayTracingPipeline final : public ICPUPipeline m_misses; - core::vector m_hitGroups; + SHitGroupSpecInfos m_hitGroups; core::vector 
m_callables; explicit ICPURayTracingPipeline(const ICPUPipelineLayout* layout) From 434d73e3063ef5a343ebf9a6909fbbb688a9553a Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 13 May 2025 15:42:26 +0700 Subject: [PATCH 093/346] Fix IGraphicsPIpeline constructor --- include/nbl/asset/IGraphicsPipeline.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/nbl/asset/IGraphicsPipeline.h b/include/nbl/asset/IGraphicsPipeline.h index ef49e4c03a..090a368c2f 100644 --- a/include/nbl/asset/IGraphicsPipeline.h +++ b/include/nbl/asset/IGraphicsPipeline.h @@ -110,7 +110,8 @@ class IGraphicsPipeline : public IPipeline, public IGraphics protected: explicit IGraphicsPipeline(const PipelineLayoutType* layout, const SCachedCreationParams& cachedParams, const renderpass_t* renderpass) : - IPipeline(core::smart_refctd_ptr(layout)), m_renderpass(core::smart_refctd_ptr(renderpass)) + IPipeline(core::smart_refctd_ptr(layout)), + m_params(cachedParams), m_renderpass(core::smart_refctd_ptr(renderpass)) {} SCachedCreationParams m_params = {}; From 1cd1771429d4bbb0c563273ad3f522dfa05e5c34 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 13 May 2025 15:42:46 +0700 Subject: [PATCH 094/346] Remove SUBGROUP_SIZE from IPIpeline --- include/nbl/asset/IPipeline.h | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/include/nbl/asset/IPipeline.h b/include/nbl/asset/IPipeline.h index c458c34afe..eb64de0b0d 100644 --- a/include/nbl/asset/IPipeline.h +++ b/include/nbl/asset/IPipeline.h @@ -105,21 +105,6 @@ class IPipelineBase }; using FLAGS = CreationFlags; - // Nabla requires device's reported subgroup size to be between 4 and 128 - enum class SUBGROUP_SIZE : uint8_t - { - // No constraint but probably means `gl_SubgroupSize` is Dynamically Uniform - UNKNOWN = 0, - // Allows the Subgroup Uniform `gl_SubgroupSize` to be non-Dynamically Uniform and vary between Device's min and max - VARYING = 1, - // The rest we encode as log2(x) of the required value - REQUIRE_4 = 
2, - REQUIRE_8 = 3, - REQUIRE_16 = 4, - REQUIRE_32 = 5, - REQUIRE_64 = 6, - REQUIRE_128 = 7 - }; }; template From 5823a841f965293c6a53ca24dbdc3a91405d9913 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 13 May 2025 15:43:21 +0700 Subject: [PATCH 095/346] Refactor IRayTracingPipeline to use new SShaderSpecInfo scheme --- include/nbl/asset/IRayTracingPipeline.h | 172 +----------------------- 1 file changed, 5 insertions(+), 167 deletions(-) diff --git a/include/nbl/asset/IRayTracingPipeline.h b/include/nbl/asset/IRayTracingPipeline.h index 0bc2d68653..50ab7ba3f3 100644 --- a/include/nbl/asset/IRayTracingPipeline.h +++ b/include/nbl/asset/IRayTracingPipeline.h @@ -14,35 +14,6 @@ namespace nbl::asset class IRayTracingPipelineBase : public virtual core::IReferenceCounted { public: - struct SShaderGroupsParams - { - struct SIndex - { - constexpr static inline uint32_t Unused = 0xffFFffFFu; - uint32_t index = Unused; - }; - - struct SHitGroup - { - uint32_t closestHit = SIndex::Unused; - uint32_t anyHit = SIndex::Unused; - uint32_t intersection = SIndex::Unused; - }; - - SIndex raygen; - std::span misses; - std::span hits; - std::span callables; - - inline uint32_t getShaderGroupCount() const - { - return 1 + hits.size() + misses.size() + callables.size(); - } - - }; - using SGeneralShaderGroup = SShaderGroupsParams::SIndex; - using SHitShaderGroup = SShaderGroupsParams::SHitGroup; - struct SCachedCreationParams final { uint32_t maxRecursionDepth : 6 = 0; @@ -53,152 +24,19 @@ class IRayTracingPipelineBase : public virtual core::IReferenceCounted template class IRayTracingPipeline : public IPipeline, public IRayTracingPipelineBase { - using base_creation_params_t = IPipeline::SCreationParams; - public: - - using SGeneralShaderGroupContainer = core::smart_refctd_dynamic_array; - using SHitShaderGroupContainer = core::smart_refctd_dynamic_array; - - struct SCreationParams : base_creation_params_t - { - public: - #define base_flag(F) 
static_cast(base_creation_params_t::FLAGS::F) - enum class FLAGS : uint64_t - { - NONE = base_flag(NONE), - DISABLE_OPTIMIZATIONS = base_flag(DISABLE_OPTIMIZATIONS), - ALLOW_DERIVATIVES = base_flag(ALLOW_DERIVATIVES), - FAIL_ON_PIPELINE_COMPILE_REQUIRED = base_flag(FAIL_ON_PIPELINE_COMPILE_REQUIRED), - EARLY_RETURN_ON_FAILURE = base_flag(EARLY_RETURN_ON_FAILURE), - SKIP_BUILT_IN_PRIMITIVES = 1<<12, - SKIP_AABBS = 1<<13, - NO_NULL_ANY_HIT_SHADERS = 1<<14, - NO_NULL_CLOSEST_HIT_SHADERS = 1<<15, - NO_NULL_MISS_SHADERS = 1<<16, - NO_NULL_INTERSECTION_SHADERS = 1<<17, - ALLOW_MOTION = 1<<20, - }; - #undef base_flag - - protected: - using SpecInfo = IPipelineBase::SShaderSpecInfo; - template - inline bool impl_valid(ExtraLambda&& extra) const - { - if (!IPipeline::SCreationParams::layout) - return false; + using base_creation_params_t = IPipeline; - for (const auto info : shaders) - { - if (info.shader) - { - if (!extra(info)) - return false; - const auto stage = info.stage; - if ((stage & ~IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING) != 0) - return false; - if (!std::has_single_bit>(stage)) - return false; - } - else - { - // every shader must not be null. use SIndex::Unused to represent unused shader. 
- return false; - } - } - - auto getShaderStage = [this](size_t index) -> IShader::E_SHADER_STAGE - { - return shaders[index].stage; - }; - - auto isValidShaderIndex = [this, getShaderStage](size_t index, IShader::E_SHADER_STAGE expectedStage, bool is_unused_shader_forbidden) -> bool - { - if (index == SShaderGroupsParams::SIndex::Unused) - return !is_unused_shader_forbidden; - if (index >= shaders.size()) - return false; - if (getShaderStage(index) != expectedStage) - return false; - return true; - }; - - if (!isValidShaderIndex(shaderGroups.raygen.index, IShader::E_SHADER_STAGE::ESS_RAYGEN, true)) - { - return false; - } - - for (const auto& shaderGroup : shaderGroups.hits) - { - // https://docs.vulkan.org/spec/latest/chapters/pipelines.html#VUID-VkRayTracingPipelineCreateInfoKHR-flags-03470 - if (!isValidShaderIndex(shaderGroup.anyHit, - IShader::E_SHADER_STAGE::ESS_ANY_HIT, - bool(flags & FLAGS::NO_NULL_ANY_HIT_SHADERS))) - return false; - - // https://docs.vulkan.org/spec/latest/chapters/pipelines.html#VUID-VkRayTracingPipelineCreateInfoKHR-flags-03471 - if (!isValidShaderIndex(shaderGroup.closestHit, - IShader::E_SHADER_STAGE::ESS_CLOSEST_HIT, - bool(flags & FLAGS::NO_NULL_CLOSEST_HIT_SHADERS))) - return false; - - if (!isValidShaderIndex(shaderGroup.intersection, - IShader::E_SHADER_STAGE::ESS_INTERSECTION, - false)) - return false; - } - - for (const auto& shaderGroup : shaderGroups.misses) - { - if (!isValidShaderIndex(shaderGroup.index, - IShader::E_SHADER_STAGE::ESS_MISS, - false)) - return false; - } - - for (const auto& shaderGroup : shaderGroups.callables) - { - if (!isValidShaderIndex(shaderGroup.index, IShader::E_SHADER_STAGE::ESS_CALLABLE, false)) - return false; - } - return true; - } - - public: - inline bool valid() const - { - return impl_valid([](const SpecInfo& info)->bool - { - if (!info.valid()) - return false; - return false; - }); - } - - std::span shaders = {}; - SShaderGroupsParams shaderGroups; - SCachedCreationParams cached = {}; - // 
TODO: Could guess the required flags from SPIR-V introspection of declared caps - core::bitflag flags = FLAGS::NONE; - }; + public: inline const SCachedCreationParams& getCachedCreationParams() const { return m_params; } protected: - explicit IRayTracingPipeline(const SCreationParams& _params) : - IPipeline(core::smart_refctd_ptr(_params.layout)), - m_params(_params.cached), - m_raygenShaderGroup(_params.shaderGroups.raygen), - m_missShaderGroups(core::make_refctd_dynamic_array(_params.shaderGroups.misses)), - m_hitShaderGroups(core::make_refctd_dynamic_array(_params.shaderGroups.hits)), - m_callableShaderGroups(core::make_refctd_dynamic_array(_params.shaderGroups.callables)) + explicit IRayTracingPipeline(const PipelineLayoutType* layout, const SCachedCreationParams& cachedParams) : + IPipeline(core::smart_refctd_ptr(layout)), + m_params(cachedParams) {} SCachedCreationParams m_params; - SGeneralShaderGroup m_raygenShaderGroup; - SGeneralShaderGroupContainer m_missShaderGroups; - SHitShaderGroupContainer m_hitShaderGroups; - SGeneralShaderGroupContainer m_callableShaderGroups; }; From 10ec458eb572b567f82774b18bee541da566d275 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 13 May 2025 15:44:10 +0700 Subject: [PATCH 096/346] Remove Subgroup related argument from IGPUPipeline --- include/nbl/video/IGPUPipeline.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/nbl/video/IGPUPipeline.h b/include/nbl/video/IGPUPipeline.h index 826026d9aa..fc4bc8d219 100644 --- a/include/nbl/video/IGPUPipeline.h +++ b/include/nbl/video/IGPUPipeline.h @@ -71,8 +71,6 @@ class IGPUPipelineBase { const asset::IShader* shader = nullptr; std::string_view entryPoint = ""; - asset::IPipelineBase::SUBGROUP_SIZE requiredSubgroupSize : 3 = asset::IPipelineBase::SUBGROUP_SIZE::UNKNOWN; //!< Default value of 8 means no requirement - uint8_t requireFullSubgroups : 1 = false; // Container choice implicitly satisfies: // 
https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-constantID-04911 From 39904f7d86c251491969619cb1a338618399dda2 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 13 May 2025 15:44:46 +0700 Subject: [PATCH 097/346] Refactor IGPUComputePipeline to use IComputePipeline --- include/nbl/video/IGPUComputePipeline.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/nbl/video/IGPUComputePipeline.h b/include/nbl/video/IGPUComputePipeline.h index 42503e1f12..065c567ee2 100644 --- a/include/nbl/video/IGPUComputePipeline.h +++ b/include/nbl/video/IGPUComputePipeline.h @@ -6,6 +6,7 @@ #include "nbl/asset/IPipeline.h" +#include "nbl/asset/IComputePipeline.h" #include "nbl/video/IGPUPipeline.h" #include "nbl/video/SPipelineCreationParams.h" @@ -14,9 +15,9 @@ namespace nbl::video { -class IGPUComputePipeline : public IGPUPipeline> +class IGPUComputePipeline : public IGPUPipeline> { - using pipeline_t = asset::IPipeline; + using pipeline_t = asset::IComputePipeline; public: struct SCreationParams final : SPipelineCreationParams From 2ce032f87550e3d1a57a638696d8cae62bee53d6 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 13 May 2025 15:45:28 +0700 Subject: [PATCH 098/346] Refactor IGPURayTracingPipeline to use new SShaderSpecInfo scheme --- include/nbl/video/IGPURayTracingPipeline.h | 145 ++++++++++++++++++++- 1 file changed, 143 insertions(+), 2 deletions(-) diff --git a/include/nbl/video/IGPURayTracingPipeline.h b/include/nbl/video/IGPURayTracingPipeline.h index c41ed333a1..2a6701c9e6 100644 --- a/include/nbl/video/IGPURayTracingPipeline.h +++ b/include/nbl/video/IGPURayTracingPipeline.h @@ -15,6 +15,147 @@ class IGPURayTracingPipeline : public IGPUPipeline; public: + struct SCreationParams + { + #define base_flag(F) static_cast(IPipelineBase::FLAGS::F) + enum class FLAGS : uint64_t + { + NONE = base_flag(NONE), + DISABLE_OPTIMIZATIONS = base_flag(DISABLE_OPTIMIZATIONS), + 
ALLOW_DERIVATIVES = base_flag(ALLOW_DERIVATIVES), + FAIL_ON_PIPELINE_COMPILE_REQUIRED = base_flag(FAIL_ON_PIPELINE_COMPILE_REQUIRED), + EARLY_RETURN_ON_FAILURE = base_flag(EARLY_RETURN_ON_FAILURE), + SKIP_BUILT_IN_PRIMITIVES = 1<<12, + SKIP_AABBS = 1<<13, + NO_NULL_ANY_HIT_SHADERS = 1<<14, + NO_NULL_CLOSEST_HIT_SHADERS = 1<<15, + NO_NULL_MISS_SHADERS = 1<<16, + NO_NULL_INTERSECTION_SHADERS = 1<<17, + ALLOW_MOTION = 1<<20, + }; + #undef base_flag + + protected: + template + inline bool impl_valid(ExtraLambda&& extra) const + { + if (!m_layout) return false; + + for (const auto info : shaders) + { + if (info.shader) + { + if (!extra(info)) + return false; + const auto stage = info.stage; + if ((stage & ~hlsl::ShaderStage::ESS_ALL_RAY_TRACING) != 0) + return false; + if (!std::has_single_bit>(stage)) + return false; + } + else + { + // every shader must not be null. use SIndex::Unused to represent unused shader. + return false; + } + } + + auto getShaderStage = [this](size_t index) -> hlsl::ShaderStage + { + return shaders[index].stage; + }; + + auto isValidShaderIndex = [this, getShaderStage](size_t index, hlsl::ShaderStage expectedStage, bool is_unused_shader_forbidden) -> bool + { + if (index == SShaderGroupsParams::SIndex::Unused) + return !is_unused_shader_forbidden; + if (index >= shaders.size()) + return false; + if (getShaderStage(index) != expectedStage) + return false; + return true; + }; + + if (!isValidShaderIndex(shaderGroups.raygen.index, hlsl::ShaderStage::ESS_RAYGEN, true)) + { + return false; + } + + for (const auto& shaderGroup : shaderGroups.hits) + { + // https://docs.vulkan.org/spec/latest/chapters/pipelines.html#VUID-VkRayTracingPipelineCreateInfoKHR-flags-03470 + if (!isValidShaderIndex(shaderGroup.anyHit, + hlsl::ShaderStage::ESS_ANY_HIT, + bool(flags & FLAGS::NO_NULL_ANY_HIT_SHADERS))) + return false; + + // https://docs.vulkan.org/spec/latest/chapters/pipelines.html#VUID-VkRayTracingPipelineCreateInfoKHR-flags-03471 + if 
(!isValidShaderIndex(shaderGroup.closestHit, + hlsl::ShaderStage::ESS_CLOSEST_HIT, + bool(flags & FLAGS::NO_NULL_CLOSEST_HIT_SHADERS))) + return false; + + if (!isValidShaderIndex(shaderGroup.intersection, + hlsl::ShaderStage::ESS_INTERSECTION, + false)) + return false; + } + + for (const auto& shaderGroup : shaderGroups.misses) + { + if (!isValidShaderIndex(shaderGroup.index, + hlsl::ShaderStage::ESS_MISS, + false)) + return false; + } + + for (const auto& shaderGroup : shaderGroups.callables) + { + if (!isValidShaderIndex(shaderGroup.index, hlsl::ShaderStage::ESS_CALLABLE, false)) + return false; + } + return true; + } + + public: + inline bool valid() const + { + return impl_valid([](const SShaderSpecInfo& info)->bool + { + if (!info.valid()) + return false; + return false; + }); + } + + struct SShaderGroupsParams + { + struct SHitGroup + { + SShaderSpecInfo closestHit; + SShaderSpecInfo anyHit; + SShaderSpecInfo intersection; + }; + + SShaderSpecInfo raygen; + std::span misses; + std::span hits; + std::span callables; + + inline uint32_t getShaderGroupCount() const + { + return 1 + hits.size() + misses.size() + callables.size(); + } + + }; + + SShaderGroupsParams shaderGroups; + + SCachedCreationParams cached = {}; + // TODO: Could guess the required flags from SPIR-V introspection of declared caps + core::bitflag flags = FLAGS::NONE; + }; + struct SShaderGroupHandle { @@ -62,7 +203,7 @@ class IGPURayTracingPipeline : public IGPUPipelinebool + const bool valid = pipeline_t::SCreationParams::impl_valid([&retval](const SShaderSpecInfo& info)->bool { const auto dataSize = info.valid(); if (dataSize<0) @@ -81,7 +222,7 @@ class IGPURayTracingPipeline : public IGPUPipeline getShaders() const { return shaders; } + inline std::span getShaders() const { return shaders; } IGPUPipelineLayout* layout = nullptr; }; From 058657b8defebb5eff9ea56431bd1f9e20ffc4b2 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 13 May 2025 15:45:42 +0700 Subject: [PATCH 099/346] Restore 
deleted comments --- include/nbl/video/SPipelineCreationParams.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/SPipelineCreationParams.h b/include/nbl/video/SPipelineCreationParams.h index 969559d941..489bff4343 100644 --- a/include/nbl/video/SPipelineCreationParams.h +++ b/include/nbl/video/SPipelineCreationParams.h @@ -49,7 +49,7 @@ struct SPipelineCreationParams return basePipelineIndex!=NotDerivingFromPreviousPipeline || basePipeline; } - + // If you set this, then we don't take `basePipelineIndex` into account, the pointer takes precedence const PipelineType* basePipeline = nullptr; int32_t basePipelineIndex = NotDerivingFromPreviousPipeline; }; From 55703e5ee459bde1858a93a03bd046c4ad7a3cb6 Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 13 May 2025 10:57:40 +0200 Subject: [PATCH 100/346] mark off what's been implemented --- .../utilities/IGPUObjectFromAssetConverter.h | 163 ------------------ 1 file changed, 163 deletions(-) diff --git a/include/nbl/video/utilities/IGPUObjectFromAssetConverter.h b/include/nbl/video/utilities/IGPUObjectFromAssetConverter.h index 600197611b..b7ffc5d0c1 100644 --- a/include/nbl/video/utilities/IGPUObjectFromAssetConverter.h +++ b/include/nbl/video/utilities/IGPUObjectFromAssetConverter.h @@ -11,128 +11,6 @@ #include "nbl/video/ILogicalDevice.h" #if 0 -auto IGPUObjectFromAssetConverter::create(const asset::ICPUAccelerationStructure** _begin, const asset::ICPUAccelerationStructure** _end, SParams& _params) -> created_gpu_object_array -{ - const size_t assetCount = std::distance(_begin, _end); - auto res = core::make_refctd_dynamic_array >(assetCount); - auto toCreateAndBuild = std::vector(); - auto buildRangeInfos = std::vector(); - toCreateAndBuild.reserve(assetCount); - buildRangeInfos.reserve(assetCount); - // Lambda function: creates the acceleration structure and It's buffer - auto allocateBufferAndCreateAccelerationStructure = [&](size_t asSize, const asset::ICPUAccelerationStructure* 
cpuas) - { - // Create buffer with cpuas->getAccelerationStructureSize - IGPUBuffer::SCreationParams gpuBufParams = {}; - gpuBufParams.size = asSize; - gpuBufParams.usage = core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; - auto gpubuf = _params.device->createBuffer(std::move(gpuBufParams)); - auto mreqs = gpubuf->getMemoryReqs(); - mreqs.memoryTypeBits &= _params.device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - auto gpubufMem = _params.device->allocate(mreqs, gpubuf.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - assert(gpubufMem.isValid()); - - // Create GPUAccelerationStructure with that buffer - IGPUAccelerationStructure::SCreationParams creatationParams = {}; - creatationParams.bufferRange.buffer = gpubuf; - creatationParams.bufferRange.offset = 0; - creatationParams.bufferRange.size = asSize; - creatationParams.flags = cpuas->getCreationParameters().flags; - creatationParams.type = cpuas->getCreationParameters().type; - return _params.device->createAccelerationStructure(std::move(creatationParams)); - }; - - for (ptrdiff_t i = 0u; i < assetCount; ++i) - { - const asset::ICPUAccelerationStructure* cpuas = _begin[i]; - - if(cpuas->hasBuildInfo()) - { - // Add to toBuild vector of ICPUAccelerationStructure - toCreateAndBuild.push_back(cpuas); - buildRangeInfos.push_back(const_cast(cpuas->getBuildRanges().begin())); - } - else if(cpuas->getAccelerationStructureSize() > 0) - { - res->operator[](i) = allocateBufferAndCreateAccelerationStructure(cpuas->getAccelerationStructureSize(), cpuas); - } - } - - if(toCreateAndBuild.empty() == false) - { - bool hostBuildCommands = false; // get from SFeatures - if(hostBuildCommands) - { - _NBL_TODO(); - } - else - { - core::vector cpuBufferDeps; - constexpr uint32_t MaxGeometryPerBuildInfo = 16; - constexpr uint32_t MaxBuffersPerGeometry = 3; // TrianglesData -> vertex+index+transformation - cpuBufferDeps.reserve(assetCount * 
MaxGeometryPerBuildInfo * MaxBuffersPerGeometry); - - // Get CPUBuffer Dependencies - for (ptrdiff_t i = 0u; i < toCreateAndBuild.size(); ++i) - { - const asset::ICPUAccelerationStructure* cpuas = toCreateAndBuild[i]; - - auto buildInfo = cpuas->getBuildInfo(); - assert(buildInfo != nullptr); - - auto geoms = buildInfo->getGeometries().begin(); - auto geomsCount = buildInfo->getGeometries().size(); - if(geomsCount == 0) - { - assert(false); - continue; - } - - for(uint32_t g = 0; g < geomsCount; ++g) - { - const auto& geom = geoms[g]; - if(geom.type == asset::IAccelerationStructure::EGT_TRIANGLES) - { - if(geom.data.triangles.indexData.isValid()) - { - auto cpuBuf = geom.data.triangles.indexData.buffer.get(); - cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT); - cpuBufferDeps.push_back(cpuBuf); - } - if(geom.data.triangles.vertexData.isValid()) - { - auto cpuBuf = geom.data.triangles.vertexData.buffer.get(); - cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT); - cpuBufferDeps.push_back(cpuBuf); - } - if(geom.data.triangles.transformData.isValid()) - { - auto cpuBuf = geom.data.triangles.transformData.buffer.get(); - cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT); - cpuBufferDeps.push_back(cpuBuf); - } - } - else if(geom.type == asset::IAccelerationStructure::EGT_AABBS) - { - if(geom.data.aabbs.data.isValid()) - { - auto cpuBuf = geom.data.aabbs.data.buffer.get(); - cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT); - cpuBufferDeps.push_back(cpuBuf); - } - } - else if(geom.type == asset::IAccelerationStructure::EGT_INSTANCES) - { - 
if(geom.data.instances.data.isValid()) - { - auto cpuBuf = geom.data.instances.data.buffer.get(); - cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT); - cpuBufferDeps.push_back(cpuBuf); - } - } - } - } - // Convert CPUBuffer Deps to GPUBuffers core::vector redirs = eliminateDuplicatesAndGenRedirs(cpuBufferDeps); auto gpuBufs = getGPUObjectsFromAssets(cpuBufferDeps.data(), cpuBufferDeps.data()+cpuBufferDeps.size(), _params); @@ -285,47 +163,6 @@ auto IGPUObjectFromAssetConverter::create(const asset::ICPUAccelerationStructure auto & gpuBuildInfo = buildGeomInfos[i]; gpuBuildInfo.scratchAddr.buffer = gpuScratchBuf; } - - // Record CommandBuffer for Building (We have Completed buildInfos + buildRanges for each CPUAS) - auto & fence = _params.fences[EQU_COMPUTE]; - fence = _params.device->createFence(static_cast(0)); - core::smart_refctd_ptr cmdbuf = _params.perQueue[EQU_COMPUTE].cmdbuf; - - IQueue::SSubmitInfo submit; - { - submit.commandBufferCount = 1u; - submit.commandBuffers = &cmdbuf.get(); - submit.waitSemaphoreCount = 0u; - submit.pWaitDstStageMask = nullptr; - submit.pWaitSemaphores = nullptr; - uint32_t waitSemaphoreCount = 0u; - } - - assert(cmdbuf->getState() == IGPUCommandBuffer::STATE::RECORDING); - cmdbuf->buildAccelerationStructures({buildGeomInfos.data(),buildGeomInfos.data()+buildGeomInfos.size()},buildRangeInfos.data()); - cmdbuf->end(); - - // TODO for future to make this function more sophisticated: Compaction, MemoryLimit for Build - - core::smart_refctd_ptr sem; - - if (_params.perQueue[EQU_COMPUTE].semaphore) - sem = _params.device->createSemaphore(); - - auto* sem_ptr = sem.get(); - auto* fence_ptr = fence.get(); - - submit.signalSemaphoreCount = sem_ptr?1u:0u; - submit.pSignalSemaphores = sem_ptr?&sem_ptr:nullptr; - - _params.perQueue[EQU_COMPUTE].queue->submit(1u, &submit, fence_ptr); - if (_params.perQueue[EQU_COMPUTE].semaphore) - 
_params.perQueue[EQU_COMPUTE].semaphore[0] = std::move(sem); - } - } - - return res; -} #endif #endif From c4aefda23a1106dc8f18e14a6896dffcd9a4bc4c Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 13 May 2025 11:19:05 +0200 Subject: [PATCH 101/346] protect against `IPreHashed` assets which don't have a valid precomputed hash --- src/nbl/video/utilities/CAssetConverter.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index de72e2f360..d678159511 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -1153,6 +1153,8 @@ bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_thostBuild; hasher << lookup.patch->compactAfterBuild; // finally the contents + if (lookup.asset->getContentHash()==NoContentHash) + return false; hasher << lookup.asset->getContentHash(); return true; } @@ -1232,6 +1234,8 @@ bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t look creationFlags |= create_flags_t::ECF_BLOCK_TEXEL_VIEW_COMPATIBLE_BIT; hasher << creationFlags; // finally the contents + if (lookup.asset->getContentHash()==NoContentHash) + return false; hasher << lookup.asset->getContentHash(); return true; } @@ -1335,6 +1339,8 @@ bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_tdata(),entry.first.meta->size()); } + if (lookup.asset->getContentHash()==NoContentHash) + return false; hasher << lookup.asset->getContentHash(); return true; } From 1c0e72efdf18c17c474e6494a3850f3f132afbcb Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 14 May 2025 15:28:55 +0700 Subject: [PATCH 102/346] split config into new file --- examples_tests | 2 +- .../nbl/builtin/hlsl/subgroup2/ballot.hlsl | 13 +++ .../nbl/builtin/hlsl/workgroup2/config.hlsl | 88 +++++++++++++++++++ .../builtin/hlsl/workgroup2/shared_scan.hlsl | 86 ++---------------- 4 files changed, 111 insertions(+), 78 deletions(-) create mode 100644 
include/nbl/builtin/hlsl/workgroup2/config.hlsl diff --git a/examples_tests b/examples_tests index 20011f5fdd..4a951b307b 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 20011f5fdd3e8454bb830ded6f4221ec75036809 +Subproject commit 4a951b307b09ecf4a054f7ac27d4dac01f5e8fb9 diff --git a/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl b/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl index 724887b995..6c7ec4f593 100644 --- a/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl +++ b/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl @@ -11,6 +11,19 @@ namespace hlsl namespace subgroup2 { +uint32_t LastSubgroupInvocation() +{ + // why this code was wrong before: + // - only compute can use SubgroupID + // - but there's no mapping of InvocationID to SubgroupID and Index + return glsl::subgroupBallotFindMSB(glsl::subgroupBallot(true)); +} + +bool ElectLast() +{ + return glsl::gl_SubgroupInvocationID()==LastSubgroupInvocation(); +} + template struct Configuration { diff --git a/include/nbl/builtin/hlsl/workgroup2/config.hlsl b/include/nbl/builtin/hlsl/workgroup2/config.hlsl new file mode 100644 index 0000000000..7855cc1701 --- /dev/null +++ b/include/nbl/builtin/hlsl/workgroup2/config.hlsl @@ -0,0 +1,88 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_WORKGROUP2_CONFIG_INCLUDED_ +#define _NBL_BUILTIN_HLSL_WORKGROUP2_CONFIG_INCLUDED_ + +#include "nbl/builtin/hlsl/cpp_compat.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace workgroup2 +{ + +namespace impl +{ +template +struct virtual_wg_size_log2 +{ + NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2),uint16_t,conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2*2+2),uint16_t,3,2>::value,1>::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t value = mpl::max_v+SubgroupSizeLog2; +}; + +template +struct items_per_invocation +{ + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocationProductLog2 = mpl::max_v; + NBL_CONSTEXPR_STATIC_INLINE uint16_t value0 = BaseItemsPerInvocation; + NBL_CONSTEXPR_STATIC_INLINE uint16_t value1 = uint16_t(0x1u) << conditional_value, ItemsPerInvocationProductLog2>::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t value2 = uint16_t(0x1u) << mpl::max_v; +}; +} + +template +struct Configuration +{ + NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSizeLog2 = _WorkgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << WorkgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = _SubgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; + static_assert(WorkgroupSizeLog2>=_SubgroupSizeLog2, "WorkgroupSize cannot be smaller than SubgroupSize"); + + // must have at least enough level 0 outputs to feed a single subgroup + NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupsPerVirtualWorkgroupLog2 = mpl::max_v; + NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupsPerVirtualWorkgroup = uint16_t(0x1u) << SubgroupsPerVirtualWorkgroupLog2; + + using virtual_wg_t = impl::virtual_wg_size_log2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = virtual_wg_t::levels; + NBL_CONSTEXPR_STATIC_INLINE uint16_t 
VirtualWorkgroupSize = uint16_t(0x1u) << virtual_wg_t::value; + using items_per_invoc_t = impl::items_per_invocation; + // NBL_CONSTEXPR_STATIC_INLINE uint32_t2 ItemsPerInvocation; TODO? doesn't allow inline definitions for uint32_t2 for some reason, uint32_t[2] as well ; declaring out of line results in not constant expression + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_0 = items_per_invoc_t::value0; + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_1 = items_per_invoc_t::value1; + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = items_per_invoc_t::value2; + static_assert(ItemsPerInvocation_1<=4, "3 level scan would have been needed with this config!"); + + NBL_CONSTEXPR_STATIC_INLINE uint16_t SharedMemSize = conditional_value::value + SubgroupsPerVirtualWorkgroup*ItemsPerInvocation_1; +}; + +// special case when workgroup size 2048 and subgroup size 16 needs 3 levels and virtual workgroup size 4096 to get a full subgroup scan each on level 1 and 2 16x16x16=4096 +// specializing with macros because of DXC bug: https://github.com/microsoft/DirectXShaderCom0piler/issues/7007 +#define SPECIALIZE_CONFIG_CASE_2048_16(ITEMS_PER_INVOC) template<>\ +struct Configuration<11, 4, ITEMS_PER_INVOC>\ +{\ + NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << 11u;\ + NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = uint16_t(4u);\ + NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2;\ + NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupsPerVirtualWorkgroupLog2 = 7u;\ + NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupsPerVirtualWorkgroup = 128u;\ + NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = 3u;\ + NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualWorkgroupSize = uint16_t(0x1u) << 4096;\ + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_0 = ITEMS_PER_INVOC;\ + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_1 = 1u;\ + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = 
1u;\ +};\ + +SPECIALIZE_CONFIG_CASE_2048_16(1) +SPECIALIZE_CONFIG_CASE_2048_16(2) +SPECIALIZE_CONFIG_CASE_2048_16(4) + +} +} +} + +#undef SPECIALIZE_CONFIG_CASE_2048_16 + +#endif diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 1abd9cccd2..b03120b5f6 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -4,88 +4,20 @@ #ifndef _NBL_BUILTIN_HLSL_WORKGROUP2_SHARED_SCAN_INCLUDED_ #define _NBL_BUILTIN_HLSL_WORKGROUP2_SHARED_SCAN_INCLUDED_ -#include "nbl/builtin/hlsl/cpp_compat.hlsl" #include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" #include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" -#include "nbl/builtin/hlsl/subgroup/ballot.hlsl" +#include "nbl/builtin/hlsl/subgroup2/ballot.hlsl" #include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" #include "nbl/builtin/hlsl/mpl.hlsl" +#include "nbl/builtin/hlsl/workgroup2/config.hlsl" -namespace nbl +namespace nbl { namespace hlsl { namespace workgroup2 { -namespace impl -{ -template -struct virtual_wg_size_log2 -{ - NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2),uint16_t,conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2*2+2),uint16_t,3,2>::value,1>::value; - NBL_CONSTEXPR_STATIC_INLINE uint16_t value = mpl::max_v+SubgroupSizeLog2; -}; - -template -struct items_per_invocation -{ - NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocationProductLog2 = mpl::max_v; - NBL_CONSTEXPR_STATIC_INLINE uint16_t value0 = BaseItemsPerInvocation; - NBL_CONSTEXPR_STATIC_INLINE uint16_t value1 = uint16_t(0x1u) << conditional_value, ItemsPerInvocationProductLog2>::value; - NBL_CONSTEXPR_STATIC_INLINE uint16_t value2 = uint16_t(0x1u) << mpl::max_v; -}; -} - -template -struct Configuration -{ - NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << WorkgroupSizeLog2; - NBL_CONSTEXPR_STATIC_INLINE uint16_t 
SubgroupSizeLog2 = uint16_t(_SubgroupSizeLog2); - NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; - static_assert(WorkgroupSizeLog2>=_SubgroupSizeLog2, "WorkgroupSize cannot be smaller than SubgroupSize"); - - // must have at least enough level 0 outputs to feed a single subgroup - NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroupLog2 = mpl::max_v; - NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroup = 0x1u << SubgroupsPerVirtualWorkgroupLog2; - - using virtual_wg_t = impl::virtual_wg_size_log2; - NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = virtual_wg_t::levels; - NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualWorkgroupSize = uint16_t(0x1u) << virtual_wg_t::value; - using items_per_invoc_t = impl::items_per_invocation; - // NBL_CONSTEXPR_STATIC_INLINE uint32_t2 ItemsPerInvocation; TODO? doesn't allow inline definitions for uint32_t2 for some reason, uint32_t[2] as well ; declaring out of line results in not constant expression - NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_0 = items_per_invoc_t::value0; - NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_1 = items_per_invoc_t::value1; - NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_2 = items_per_invoc_t::value2; - static_assert(ItemsPerInvocation_1<=4, "3 level scan would have been needed with this config!"); - - NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedMemSize = conditional_value::value + SubgroupsPerVirtualWorkgroup*ItemsPerInvocation_1; -}; - -// special case when workgroup size 2048 and subgroup size 16 needs 3 levels and virtual workgroup size 4096 to get a full subgroup scan each on level 1 and 2 16x16x16=4096 -// specializing with macros because of DXC bug: https://github.com/microsoft/DirectXShaderCom0piler/issues/7007 -#define SPECIALIZE_CONFIG_CASE_2048_16(ITEMS_PER_INVOC) template<>\ -struct Configuration<11, 4, ITEMS_PER_INVOC>\ -{\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) 
<< 11u;\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = uint16_t(4u);\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2;\ - NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroupLog2 = 7u;\ - NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroup = 128u;\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = 3u;\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualWorkgroupSize = uint16_t(0x1u) << 4096;\ - NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_0 = ITEMS_PER_INVOC;\ - NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_1 = 1u;\ - NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_2 = 1u;\ -};\ - -SPECIALIZE_CONFIG_CASE_2048_16(1) -SPECIALIZE_CONFIG_CASE_2048_16(2) -SPECIALIZE_CONFIG_CASE_2048_16(4) - -#undef SPECIALIZE_CONFIG_CASE_2048_16 - - namespace impl { @@ -171,7 +103,7 @@ struct reduce { dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); scan_local[idx] = reduction0(scan_local[idx]); - if (subgroup::ElectLast()) + if (subgroup2::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); @@ -228,7 +160,7 @@ struct scan { dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); scan_local[idx] = inclusiveScan0(scan_local[idx]); - if (subgroup::ElectLast()) + if (subgroup2::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + 
(virtualSubgroupID/Config::ItemsPerInvocation_1); @@ -304,7 +236,7 @@ struct reduce { dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); scan_local[idx] = reduction0(scan_local[idx]); - if (subgroup::ElectLast()) + if (subgroup2::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); @@ -322,7 +254,7 @@ struct reduce for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); lv1_val = reduction1(lv1_val); - if (subgroup::ElectLast()) + if (subgroup2::ElectLast()) { const uint32_t bankedIndex = (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (invocationIndex/Config::ItemsPerInvocation_2); scratchAccessor.set(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); @@ -380,7 +312,7 @@ struct scan { dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); scan_local[idx] = inclusiveScan0(scan_local[idx]); - if (subgroup::ElectLast()) + if (subgroup2::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); @@ -399,7 +331,7 @@ struct scan for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); lv1_val = inclusiveScan1(lv1_val); - if (subgroup::ElectLast()) + if 
(subgroup2::ElectLast()) { const uint32_t bankedIndex = (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); scratchAccessor.set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); From a8794023e368990182b498ccbe5328187fe2662e Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 14 May 2025 11:30:46 +0200 Subject: [PATCH 103/346] add more debug for @kept_secret --- src/nbl/video/utilities/CAssetConverter.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index d678159511..ea4dbf8b0f 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2918,10 +2918,10 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } } } - if (!buildSize) + if (buildSize==0 || sizes.buildScratchSize==0) { inputs.logger.log( - "Build Size Input is 0 for Acceleration Structure %8llx%8llx%8llx%8llx", + "Build Size Input is 0 or failed the call to `ILogicalDevice::getAccelerationStructureBuildSizes` for Acceleration Structure %8llx%8llx%8llx%8llx", system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] ); continue; From 61e44254917c3432d698c64545e88c29f8e4fa00 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 14 May 2025 11:33:53 +0200 Subject: [PATCH 104/346] got the BLAS build size query CPU vs GPU input buffer parameters wrong way around --- src/nbl/video/utilities/CAssetConverter.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index ea4dbf8b0f..1f28c3ac0f 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2862,15 +2862,15 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult const 
auto geoms = as->getAABBGeometries(); if (patch.hostBuild) { - const std::span> cpuGeoms = { - reinterpret_cast*>(geoms.data()),geoms.size() + const std::span> cpuGeoms = { + reinterpret_cast*>(geoms.data()),geoms.size() }; sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); } else { - const std::span> cpuGeoms = { - reinterpret_cast*>(geoms.data()),geoms.size() + const std::span> cpuGeoms = { + reinterpret_cast*>(geoms.data()),geoms.size() }; sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); } @@ -2885,15 +2885,15 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult const auto geoms = as->getTriangleGeometries(); if (patch.hostBuild) { - const std::span> cpuGeoms = { - reinterpret_cast*>(geoms.data()),geoms.size() + const std::span> cpuGeoms = { + reinterpret_cast*>(geoms.data()),geoms.size() }; sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); } else { - const std::span> cpuGeoms = { - reinterpret_cast*>(geoms.data()),geoms.size() + const std::span> cpuGeoms = { + reinterpret_cast*>(geoms.data()),geoms.size() }; sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); } From d3ff417cc616d4560eb0979ad242274f9cd5a2b6 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 14 May 2025 16:42:44 +0700 Subject: [PATCH 105/346] as fixes to asset converter --- examples_tests | 2 +- include/nbl/asset/ICPUAccelerationStructure.h | 2 +- include/nbl/video/asset_traits.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples_tests b/examples_tests index 8c76367c1c..16b7349f55 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 8c76367c1c226cce3d66f1c60f540e29a501a1cb +Subproject commit 16b7349f55344cafc8ec9ab28ce72e129fe938bd diff --git a/include/nbl/asset/ICPUAccelerationStructure.h 
b/include/nbl/asset/ICPUAccelerationStructure.h index 9c9af32f7b..a2229309b5 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -140,7 +140,7 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo inline core::blake3_hash_t computeContentHash() const override { - if (!missingContent()) + if (missingContent()) return INVALID_HASH; const bool isAABB = m_buildFlags.hasFlags(BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT); core::blake3_hasher hasher; diff --git a/include/nbl/video/asset_traits.h b/include/nbl/video/asset_traits.h index 77bab76f64..442060d879 100644 --- a/include/nbl/video/asset_traits.h +++ b/include/nbl/video/asset_traits.h @@ -194,7 +194,7 @@ struct asset_traits // the asset type using asset_t = asset::ICPUBottomLevelAccelerationStructure; // we don't need to descend during DFS into other assets - constexpr static inline bool HasChildren = true; + constexpr static inline bool HasChildren = false; // the video type using video_t = IGPUBottomLevelAccelerationStructure; // lookup type From 14320663adabe36dfe8e9d3aaef69f609250dc8c Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 14 May 2025 13:00:50 +0200 Subject: [PATCH 106/346] fix passing QueryOnly for the triangle version of `getVkASGeometryFrom` start some light validation code in `ILogicalDevice::getAccelerationStructureBuildSizes` for BLASes --- include/nbl/video/ILogicalDevice.h | 8 ++++++++ src/nbl/video/CVulkanAccelerationStructure.h | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index f2998d8e8c..93aa965416 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -455,6 +455,14 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe uint32_t primsFree = limits.maxAccelerationStructurePrimitiveCount; for (auto i=0u; i,Geometry>) + { + // TODO: do we check 
`maxVertex`, `vertexStride` and `indexType` for validity? + } + if constexpr (std::is_same_v,Geometry>) + { + // TODO: check stride and geometry flags for validity? + } if (pMaxPrimitiveCounts[i] > primsFree) { NBL_LOG_ERROR("Primitive count exceeds device limit"); diff --git a/src/nbl/video/CVulkanAccelerationStructure.h b/src/nbl/video/CVulkanAccelerationStructure.h index 6b94f9cad7..eb1e0534fe 100644 --- a/src/nbl/video/CVulkanAccelerationStructure.h +++ b/src/nbl/video/CVulkanAccelerationStructure.h @@ -134,7 +134,7 @@ void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles(triangles.transform); else { if constexpr (triangles.Host) @@ -147,7 +147,7 @@ void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles& triangles, VkAccelerationStructureGeometryKHR& outBase, VkAccelerationStructureGeometryMotionTrianglesDataNV* &p_vertexMotion) { - getVkASGeometryFrom(triangles,outBase); + getVkASGeometryFrom(triangles,outBase); if (triangles.vertexData[1].buffer) { p_vertexMotion->sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_MOTION_TRIANGLES_DATA_NV; From 5290d656649419e2334a5b8569ccf850157ef80b Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 14 May 2025 13:01:45 +0200 Subject: [PATCH 107/346] incorrect refactor revert --- src/nbl/video/CVulkanAccelerationStructure.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nbl/video/CVulkanAccelerationStructure.h b/src/nbl/video/CVulkanAccelerationStructure.h index eb1e0534fe..b6c06f158d 100644 --- a/src/nbl/video/CVulkanAccelerationStructure.h +++ b/src/nbl/video/CVulkanAccelerationStructure.h @@ -134,7 +134,7 @@ void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles(triangles.transform); + outBase.geometry.triangles.transformData = DummyNonNullAddress; else { if constexpr (triangles.Host) From 6d8b728d048281c95550e5d0e11be6dae32f53ba Mon Sep 17 
00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 14 May 2025 13:29:26 +0200 Subject: [PATCH 108/346] update cmake/submodules/update.cmake, respect private submodules with git config on fly; update examples_tests submodule, private submodule is excluded from recurse update by default from now --- cmake/submodules/update.cmake | 23 +++++++++++++++++++---- examples_tests | 2 +- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/cmake/submodules/update.cmake b/cmake/submodules/update.cmake index 76e3603980..d0365c72ca 100644 --- a/cmake/submodules/update.cmake +++ b/cmake/submodules/update.cmake @@ -8,6 +8,9 @@ option(NBL_UPDATE_GIT_SUBMODULE_INCLUDE_PRIVATE "Turn this ON to attempt to upda option(NBL_UPDATE_GIT_SUBMODULE_NO_SEPARATE_SHELL "Turn this ON to prevent CMake from executing git submodules update or sync in a separate shell - be aware that the interaction with shell will be impossible in case of paraphrase prompt request of your key!" ON) option(NBL_CI_GIT_SUBMODULES_SHALLOW "" OFF) +# TODO: replace all of this command recording & proxy logic with executing single recurse one-liner including -c options for private submodules +# once we have relative URLs + all .gitmodules configs are polished (so basically we don't have to set some config options on fly) + if(NOT DEFINED NBL_ROOT_PATH) get_filename_component(NBL_ROOT_PATH "${CMAKE_CURRENT_LIST_DIR}/../../" ABSOLUTE) endif() @@ -26,6 +29,18 @@ endif() function(NBL_UPDATE_SUBMODULES) ProcessorCount(_GIT_SUBMODULES_JOBS_AMOUNT_) + + set(PRIVATE_SUBMODULES + Ditt-Reference-Scenes + ) + + foreach(NBL_P_SUBMODULE_NAME ${PRIVATE_SUBMODULES}) + if(NBL_UPDATE_GIT_SUBMODULE_INCLUDE_PRIVATE) + list(APPEND NBL_CONFIG_PRIVATE_SETUP_CMD "-c submodule.\"${NBL_P_SUBMODULE_NAME}\".update=checkout") + else() + list(APPEND NBL_CONFIG_PRIVATE_SETUP_CMD "-c submodule.\"${NBL_P_SUBMODULE_NAME}\".update=none") + endif() + endforeach() if(NBL_CI_GIT_SUBMODULES_SHALLOW) set(NBL_SHALLOW "--depth=1") @@ -54,9 +69,9 @@ 
function(NBL_UPDATE_SUBMODULES) endif() if(SHOULD_RECURSIVE) - set(_NBL_EXECUTE_COMMAND_ "\"${GIT_EXECUTABLE}\" -C \"${NBL_ROOT_PATH}/${GIT_RELATIVE_ENTRY}\" ${NBL_EXCLUDE} submodule update --init -j ${_GIT_SUBMODULES_JOBS_AMOUNT_} ${NBL_FORCE} --recursive ${NBL_SHALLOW} ${GIT_SUBMODULE_PATH}") + set(_NBL_EXECUTE_COMMAND_ "\"${GIT_EXECUTABLE}\" -C \"${NBL_ROOT_PATH}/${GIT_RELATIVE_ENTRY}\" ${NBL_EXCLUDE} ${NBL_CONFIG_PRIVATE_SETUP_CMD} submodule update --init -j ${_GIT_SUBMODULES_JOBS_AMOUNT_} ${NBL_FORCE} --recursive ${NBL_SHALLOW} ${GIT_SUBMODULE_PATH}") else() - set(_NBL_EXECUTE_COMMAND_ "\"${GIT_EXECUTABLE}\" -C \"${NBL_ROOT_PATH}/${GIT_RELATIVE_ENTRY}\" ${NBL_EXCLUDE} submodule update --init -j ${_GIT_SUBMODULES_JOBS_AMOUNT_} ${NBL_FORCE} ${NBL_SHALLOW} ${GIT_SUBMODULE_PATH}") + set(_NBL_EXECUTE_COMMAND_ "\"${GIT_EXECUTABLE}\" -C \"${NBL_ROOT_PATH}/${GIT_RELATIVE_ENTRY}\" ${NBL_EXCLUDE} ${NBL_CONFIG_PRIVATE_SETUP_CMD} submodule update --init -j ${_GIT_SUBMODULES_JOBS_AMOUNT_} ${NBL_FORCE} ${NBL_SHALLOW} ${GIT_SUBMODULE_PATH}") endif() string(APPEND _NBL_UPDATE_SUBMODULES_COMMANDS_ "${_NBL_EXECUTE_COMMAND_}\n") @@ -131,6 +146,7 @@ execute_process(COMMAND "${GIT_EXECUTABLE}" ${NBL_CONFIG_SETUP_CMD} submodule up NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./3rdparty TRUE "${NBL_3RDPARTY_MODULES_TO_SKIP}") # boost's 3rdparties, special case + # TODO: fork boost and update .gitmodules to cover only libs we want to use set(NBL_BOOST_LIBS_TO_INIT ${NBL_BOOST_LIBS} wave numeric_conversion) # wave and all of its deps, numeric_conversion is nested in conversion submodule (for some reason boostdep tool doesn't output it properly) foreach(NBL_TARGET ${NBL_BOOST_LIBS_TO_INIT}) list(APPEND NBL_BOOST_SUBMODULES_TO_INIT ${NBL_TARGET}) @@ -153,8 +169,7 @@ execute_process(COMMAND "${GIT_EXECUTABLE}" ${NBL_CONFIG_SETUP_CMD} submodule up # examples and their media if(NBL_BUILD_EXAMPLES) - NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./examples_tests FALSE "") - 
NBL_WRAPPER_COMMAND_EXCLUSIVE(examples_tests ./media FALSE "") + NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./examples_tests TRUE "") endif() endif() diff --git a/examples_tests b/examples_tests index 8c76367c1c..825c73d5d8 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 8c76367c1c226cce3d66f1c60f540e29a501a1cb +Subproject commit 825c73d5d8307efef2488f0b6ce82b69c32855ea From dff6f4ee1981b9a8de5bcf11e0a781c26a144fcd Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 14 May 2025 13:41:01 +0200 Subject: [PATCH 109/346] exclude 3rdparty/glTFSampleModels from default update --- .gitmodules | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 8edc1cead9..7ed5921c66 100644 --- a/.gitmodules +++ b/.gitmodules @@ -54,6 +54,7 @@ [submodule "3rdparty/glTFSampleModels"] path = 3rdparty/glTFSampleModels url = git@github.com:Devsh-Graphics-Programming/glTF-Sample-Models.git + update = none [submodule "3rdparty/nbl_spirv_cross"] path = 3rdparty/nbl_spirv_cross url = git@github.com:devshgraphicsprogramming/SPIRV-Cross.git @@ -116,4 +117,4 @@ url = git@github.com:Devsh-Graphics-Programming/libdeflate.git [submodule "docker/compiler-explorer"] path = docker/compiler-explorer - url = git@github.com:Devsh-Graphics-Programming/Compiler-Explorer-Docker.git + url = git@github.com:Devsh-Graphics-Programming/Compiler-Explorer-Docker.git \ No newline at end of file From cc9f6943ea34afa6dc375dad312c2af2bcaafbcd Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 14 May 2025 13:54:41 +0200 Subject: [PATCH 110/346] update .gitmodules, allow git to allocate jobs to update submodules --- .gitmodules | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 7ed5921c66..0aacb58ffd 100644 --- a/.gitmodules +++ b/.gitmodules @@ -117,4 +117,8 @@ url = git@github.com:Devsh-Graphics-Programming/libdeflate.git [submodule "docker/compiler-explorer"] path = 
docker/compiler-explorer - url = git@github.com:Devsh-Graphics-Programming/Compiler-Explorer-Docker.git \ No newline at end of file + url = git@github.com:Devsh-Graphics-Programming/Compiler-Explorer-Docker.git + +[submodule] + # https://git-scm.com/docs/git-config#Documentation/git-config.txt-submodulefetchJobs + fetchJobs = 0 \ No newline at end of file From 41ef540b1a661411e121825f345e4c5a854aefb4 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 14 May 2025 13:59:24 +0200 Subject: [PATCH 111/346] I forgot to make the deferredly created BLASes and TLASes write to the post GPU object creation output array. Also found that I stored a lot of stuff redundantly in the `DeferredASCreationParams` --- src/nbl/video/utilities/CAssetConverter.cpp | 121 ++++++++++---------- 1 file changed, 62 insertions(+), 59 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 1f28c3ac0f..0d76f2868b 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2759,10 +2759,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // BLAS and TLAS creation is somewhat delayed by buffer creation and allocation struct DeferredASCreationParams { - const IAccelerationStructure* canonical; asset_cached_t storage = {}; - uint64_t patchIx = 0; - uint64_t uniqueCopyGroupID = 0; uint64_t scratchSize = 0; uint64_t buildSize = 0; }; @@ -2931,7 +2928,6 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // we need to save the buffer in a side-channel for later auto& out = accelerationStructureParams[IsTLAS][entry.second.firstCopyIx+i]; - out.canonical = as; // this is where it gets a bit weird, we need to create a buffer to back the acceleration structure { IGPUBuffer::SCreationParams params = {}; @@ -2950,8 +2946,6 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult continue; } } - out.patchIx = patchIx; - 
out.uniqueCopyGroupID = uniqueCopyGroupID; out.scratchSize = sizes.buildScratchSize; out.buildSize = buildSize; } @@ -3386,7 +3380,8 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } // clear what we don't need - conversionRequests.gpuObjUniqueCopyGroupIDs.clear(); + if constexpr (!std::is_base_of_v) + conversionRequests.gpuObjUniqueCopyGroupIDs.clear(); // This gets deferred till AFTER the Buffer Memory Allocations and Binding if constexpr (!std::is_base_of_v && !std::is_base_of_v::video_t>) { @@ -3418,7 +3413,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult bufferConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); // Deal with Deferred Creation of Acceleration structures { - auto createAccelerationStructures = [&]()->void + auto createAccelerationStructures = [&](conversions_t& requests)->void { constexpr bool IsTLAS = std::is_same_v; // @@ -3428,63 +3423,70 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult else pConversions = retval.m_blasConversions; // we enqueue the conversions AFTER making sure that the BLAS / TLAS can actually be created - for (size_t i=0; i(deferredParams.canonical); - const auto& dfsNode = std::get>(dfsCaches).nodes[deferredParams.patchIx]; - const auto& patch = dfsNode.patch; - // create the AS - const auto bufSz = deferredParams.storage.get()->getSize(); - IGPUAccelerationStructure::SCreationParams baseParams; - { - using create_f = IGPUAccelerationStructure::SCreationParams::FLAGS; - baseParams = { - .bufferRange = {.offset=0,.size=bufSz,.buffer=deferredParams.storage.value}, - .flags = patch.isMotion ? 
create_f::MOTION_BIT:create_f::NONE - }; - } - smart_refctd_ptr::video_t> as; - CAssetConverter::SReserveResult::SConvReqTLAS::cpu_to_gpu_blas_map_t blasInstanceMap; - if constexpr (IsTLAS) - { - // check if the BLASes we want to use for the instances were successfully allocated and created - AssetVisitor> visitor = { - {inputs,dfsCaches,&blasInstanceMap}, - {canonical,deferredParams.uniqueCopyGroupID}, - patch - }; - if (!visitor()) - { - inputs.logger.log( - "Failed to find all GPU Bottom Level Acceleration Structures needed to build TLAS %8llx%8llx%8llx%8llx", - system::ILogger::ELL_ERROR//,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] - ); - continue; - } - as = device->createTopLevelAccelerationStructure({std::move(baseParams),patch.maxInstances}); - } - else - as = device->createBottomLevelAccelerationStructure(std::move(baseParams)); - if (!as) + const auto reqIx = entry.second.firstCopyIx+i; + if (const auto& deferredParams=accelerationStructureParams[IsTLAS][reqIx]; deferredParams.storage) { - inputs.logger.log("Failed to Create Acceleration Structure.",system::ILogger::ELL_ERROR); - continue; + const auto* canonical = entry.second.canonicalAsset; + const auto& dfsNode = std::get>(dfsCaches).nodes[entry.second.patchIndex.value]; + const auto& patch = dfsNode.patch; + // create the AS + const auto bufSz = deferredParams.storage.get()->getSize(); + IGPUAccelerationStructure::SCreationParams baseParams; + { + using create_f = IGPUAccelerationStructure::SCreationParams::FLAGS; + baseParams = { + .bufferRange = {.offset=0,.size=bufSz,.buffer=deferredParams.storage.value}, + .flags = patch.isMotion ? 
create_f::MOTION_BIT:create_f::NONE + }; + } + smart_refctd_ptr::video_t> as; + CAssetConverter::SReserveResult::SConvReqTLAS::cpu_to_gpu_blas_map_t blasInstanceMap; + if constexpr (IsTLAS) + { + // check if the BLASes we want to use for the instances were successfully allocated and created + AssetVisitor> visitor = { + {inputs,dfsCaches,&blasInstanceMap}, + {canonical,requests.gpuObjUniqueCopyGroupIDs[reqIx]}, + patch + }; + if (!visitor()) + { + const auto hashAsU64 = reinterpret_cast(entry.first.data); + inputs.logger.log( + "Failed to find all GPU Bottom Level Acceleration Structures needed to build TLAS %8llx%8llx%8llx%8llx", + system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); + continue; + } + as = device->createTopLevelAccelerationStructure({std::move(baseParams),patch.maxInstances}); + } + else + as = device->createBottomLevelAccelerationStructure(std::move(baseParams)); + if (!as) + { + inputs.logger.log("Failed to Create Acceleration Structure.",system::ILogger::ELL_ERROR); + continue; + } + // file the request for conversion + auto& request = pConversions[patch.hostBuild][as.get()]; + request.canonical = smart_refctd_ptr(canonical); + request.scratchSize = deferredParams.scratchSize; + request.compact = patch.compactAfterBuild; + request.buildFlags = static_cast(patch.getBuildFlags(canonical).value); + request.buildSize = deferredParams.buildSize; + if constexpr (IsTLAS) + request.instanceMap = std::move(blasInstanceMap); + requests.assign(entry.first,entry.second.firstCopyIx,i,std::move(as)); } - // file the request for conversion - auto& request = pConversions[patch.hostBuild][as.get()]; - request.canonical = smart_refctd_ptr(canonical); - request.scratchSize = deferredParams.scratchSize; - request.compact = patch.compactAfterBuild; - request.buildFlags = static_cast(patch.getBuildFlags(canonical).value); - request.buildSize = deferredParams.buildSize; - if constexpr (IsTLAS) - request.instanceMap = 
std::move(blasInstanceMap); } + requests.gpuObjUniqueCopyGroupIDs.clear(); }; - createAccelerationStructures.template operator()(); + createAccelerationStructures.template operator()(blasConversions); blasConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); - createAccelerationStructures.template operator()(); + createAccelerationStructures.template operator()(tlasConversions); tlasConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); } // enqueue successfully created images with data to upload for conversion @@ -3577,6 +3579,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult retval.m_tlasConversions[i].erase(gpuObj); if constexpr (std::is_same_v) retval.m_imageConversions.erase(gpuObj); + // TODO: erase from `retval.m_gpuObjects` as well return true; } // still referenced, keep it around From 310eafd491cfacf8089248b0266d25c5ad0a0f2e Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 14 May 2025 15:45:18 +0200 Subject: [PATCH 112/346] fix various typos and bugs in Asset Converter --- include/nbl/video/utilities/CAssetConverter.h | 3 +- src/nbl/video/utilities/CAssetConverter.cpp | 35 +++++++++---------- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index 01da012a0d..182b025ada 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -959,7 +959,8 @@ class CAssetConverter : public core::IReferenceCounted uint32_t sampledImageBindingCount = 1<<10; uint32_t storageImageBindingCount = 11<<10; // specific to Acceleration Structure Build, they need to be at least as large as the largest amount of scratch required for an AS build - CAsyncSingleBufferSubAllocatorST>* scratchForDeviceASBuild = nullptr; + using scratch_for_device_AS_build_t = CAsyncSingleBufferSubAllocatorST>; + scratch_for_device_AS_build_t* 
scratchForDeviceASBuild = nullptr; std::pmr::memory_resource* scratchForHostASBuild = nullptr; // needs to service allocations without limit, unlike the above where failure will just force a flush and performance of already queued up builds IDeviceMemoryAllocator* compactedASAllocator = nullptr; diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 0d76f2868b..b6f0541a3f 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2455,7 +2455,7 @@ struct conversions_t const uint64_t uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[copyIx+baseIx]; if constexpr (std::is_same_v || std::is_same_v) { - const auto constrainMask = inputs->constrainMemoryTypeBits(uniqueCopyGroupID,asset,contentHash,gpuObj.get()); + const auto constrainMask = inputs->constrainMemoryTypeBits(uniqueCopyGroupID,asset,contentHash,output->value.get()); if (!deferredAllocator->request(output,constrainMask)) return; } @@ -3766,11 +3766,10 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // Descriptor Sets need their TLAS descriptors substituted if they've been compacted core::unordered_map> compactedTLASMap; // Anything to do? 
- auto reqQueueFlags = reservations.m_queueFlags; - if (reqQueueFlags.value!=IQueue::FAMILY_FLAGS::NONE) + if (reservations.m_queueFlags.value!=IQueue::FAMILY_FLAGS::NONE) { // whether we actually get around to doing that depends on validity and success of transfers - const bool shouldDoSomeCompute = reqQueueFlags.hasFlags(IQueue::FAMILY_FLAGS::COMPUTE_BIT); + const bool shouldDoSomeCompute = reservations.m_queueFlags.hasFlags(IQueue::FAMILY_FLAGS::COMPUTE_BIT); auto invalidIntended = [device,logger](const IQueue::FAMILY_FLAGS flag, const SIntendedSubmitInfo* intended)->bool { if (!intended || !intended->valid()) @@ -3852,7 +3851,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul const auto transferFamily = params.transfer->queue->getFamilyIndex(); // But don't want to have to do QFOTs between Transfer and Queue Families then if (transferFamily!=computeFamily) - if (!scratchParams.canBeUsedByQueueFamily(transferFamily)) + if (!scratchParams.isConcurrentSharing() || !scratchParams.canBeUsedByQueueFamily(transferFamily)) { logger.log("Acceleration Structure Scratch Device Memory Allocator not mapped and not concurrently share-able by Transfer Family %d!",system::ILogger::ELL_ERROR,transferFamily); return retval; @@ -3868,7 +3867,6 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul logger.log("An Acceleration Structure will be built on Device but Default UpStreaming Buffer from IUtilities doesn't have required usage flags!", system::ILogger::ELL_ERROR); return retval; } - reqQueueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; } } // the elusive and exotic host builds @@ -3885,10 +3883,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul if (reservations.willCompactAS()) { if (!params.compactedASAllocator) - { - logger.log("An Acceleration Structure will be compacted but no Device Memory Allocator provided!", system::ILogger::ELL_ERROR); - return retval; - } + logger.log("Acceleration Structures will be compacted using 
the ILogicalDevice as the memory allocator!", system::ILogger::ELL_WARNING); // note that can't check the compacted AS allocator being large enough against `reservations.m_compactedASMaxMemory` } @@ -4851,7 +4846,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul break; } if (depInfo.wasInStaging) - dependsOnBLASBuilds; + dependsOnBLASBuilds = true; instanceDataSize += ITopLevelAccelerationStructure::getInstanceSize(instance.getType()); } // problem with building some Dependent BLASes @@ -4872,7 +4867,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul const addr_t sizes[MaxAllocCount] = {tlasToBuild.second.scratchSize,instanceDataSize,sizeof(void*)*instanceCount}; { const addr_t alignments[MaxAllocCount] = {limits.minAccelerationStructureScratchOffsetAlignment,16,alignof(uint64_t)}; - const auto AllocCount = as->usesMotion() ? 2:3; + const auto AllocCount = as->usesMotion() ? 3:2; // if fail then flush and keep trying till space is made for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(AllocCount,&offsets[0],&sizes[0],&alignments[0])!=0u; t++) if (t==1) // don't flush right away cause allocator not defragmented yet @@ -4902,14 +4897,14 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul assert(offsetInRange%16==0); uint32_t bytesWritten = 0; - while (true) + while (instanceIndex=blockSize) - return bytesWritten; + if (newWritten>blockSize) + break; auto found = instanceMap->find(instance.getBase().blas.get()); auto blas = found->second.get(); if (auto found=compactedBLASMap->find(blas); found!=compactedBLASMap->end()) @@ -4918,6 +4913,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul dst = IGPUTopLevelAccelerationStructure::writeInstance(dst,instance,blas->getReferenceForDeviceOperations()); bytesWritten = newWritten; } + return bytesWritten; } const compacted_blas_map_t* compactedBLASMap; @@ -4994,7 +4990,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul 
// enqueue ownership release if necessary if (finalOwnerQueueFamily!=IQueue::FamilyIgnored) { - compactedOwnershipReleaseIndices.push_back(ownershipTransfers.size()); + if (willCompact) + compactedOwnershipReleaseIndices.push_back(ownershipTransfers.size()); ownershipTransfers.push_back({ .barrier = { .dep = { @@ -5008,7 +5005,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul .range = backingRange }); } - else + else if (willCompact) compactedOwnershipReleaseIndices.push_back(~0u); } // finish the last batch @@ -5049,7 +5046,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // create and allocate backing buffers for compacted TLASes core::vector> backingBuffers(compactions.size()); { - MetaDeviceMemoryAllocator deferredAllocator(params.compactedASAllocator,logger); + MetaDeviceMemoryAllocator deferredAllocator(params.compactedASAllocator ? params.compactedASAllocator:device,logger); // create for (size_t i=0; i CAssetConverter::convert_impl(SReserveResul // in the future we'll also finish host image copies // check dependents before inserting into cache - if (reqQueueFlags.value!=IQueue::FAMILY_FLAGS::NONE) + if (reservations.m_queueFlags.value!=IQueue::FAMILY_FLAGS::NONE) { auto checkDependents = [&]()->void { From 5c519d095903f4cc42ad4628185b82f37ae77563 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 14 May 2025 16:45:32 +0200 Subject: [PATCH 113/346] `core::makeRAIIExiter` is literally the best thing since sliced bread Fix bugs: - ReBAR only buffer transfers dereferencing a nullptr transfer cmbduf - BLAS and TLAS memory allocations latching on semaphores which will never signal if the command recording fails for some reason --- src/nbl/video/utilities/CAssetConverter.cpp | 55 ++++++++++++++++----- 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index b6f0541a3f..bc9fac01c0 100644 --- 
a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -3991,7 +3991,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul core::vector finalReleases; finalReleases.reserve(buffersToUpload.size()); // do the uploads - if (!buffersToUpload.empty()) + if (!buffersToUpload.empty() && xferCmdBuf) { xferCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Upload Buffers START"); xferCmdBuf->cmdbuf->endDebugMarker(); @@ -4039,7 +4039,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul .range = range }); } - if (!buffersToUpload.empty()) + if (!buffersToUpload.empty() && xferCmdBuf) { xferCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Upload Buffers END"); xferCmdBuf->cmdbuf->endDebugMarker(); @@ -4653,6 +4653,12 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul return false; }; // + using scratch_allocator_t = std::remove_reference_t; + using addr_t = typename scratch_allocator_t::size_type; + core::vector scratchOffsets; + scratchOffsets.reserve(maxASCount); + core::vector scratchSizes; + scratchSizes.reserve(maxASCount); auto recordBuildCommandsBase = [&](auto& buildInfos, auto& rangeInfos)->void { if (buildInfos.empty()) @@ -4665,13 +4671,25 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul bool success = !uniQueue || !deviceASBuildScratchPtr || pipelineBarrier(computeCmdBuf,{.memBarriers={&readGeometryOrInstanceInASBuildBarrier,1}},"Pipeline Barriers of Acceleration Structure backing Buffers failed!"); // success = success && computeCmdBuf->cmdbuf->buildAccelerationStructures({buildInfos},rangeInfos.data()); - if (!success) - for (const auto& info : buildInfos) + if (success) { - const auto stagingFound = findInStaging.template operator()(info.dstAS); - smart_refctd_ptr dummy; // already null at this point - markFailure("AS Build Command Recording",&dummy,&stagingFound->second); + submitsNeeded |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; + // queue up a 
deferred allocation + params.scratchForDeviceASBuild->multi_deallocate(scratchOffsets.size(),scratchOffsets.data(),scratchSizes.data(),params.compute->getFutureScratchSemaphore()); + } + else + { + // release right away + params.scratchForDeviceASBuild->multi_deallocate(scratchOffsets.size(),scratchOffsets.data(),scratchSizes.data()); + for (const auto& info : buildInfos) + { + const auto stagingFound = findInStaging.template operator()(info.dstAS); + smart_refctd_ptr dummy; // already null at this point + markFailure("AS Build Command Recording",&dummy,&stagingFound->second); + } } + scratchOffsets.clear(); + scratchSizes.clear(); buildInfos.clear(); rangeInfos.clear(); }; @@ -4813,8 +4831,6 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul trackedBLASes.clear(); }; // - using scratch_allocator_t = std::remove_reference_t; - using addr_t = typename scratch_allocator_t::size_type; const auto& limits = physDev->getLimits(); for (auto& tlasToBuild : tlasesToBuild) { @@ -4865,9 +4881,25 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul constexpr uint32_t MaxAllocCount = 3; addr_t offsets[MaxAllocCount] = {scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value}; const addr_t sizes[MaxAllocCount] = {tlasToBuild.second.scratchSize,instanceDataSize,sizeof(void*)*instanceCount}; + const auto AllocCount = as->usesMotion() ? 3:2; + // clean up the allocation if we fail to make it to the end of loop for whatever reason + bool abortAllocation = true; + auto deallocSrc = core::makeRAIIExiter([¶ms,&scratchOffsets,&scratchSizes,AllocCount,&offsets,&sizes,&abortAllocation]()->void + { + // if got to end of loop queue up the release of memory, otherwise release right away + if (abortAllocation) + params.scratchForDeviceASBuild->multi_deallocate(AllocCount,&offsets[0],&sizes[0]); + else + for (auto i=0; iusesMotion() ? 
3:2; // if fail then flush and keep trying till space is made for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(AllocCount,&offsets[0],&sizes[0],&alignments[0])!=0u; t++) if (t==1) // don't flush right away cause allocator not defragmented yet @@ -4881,8 +4913,6 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } drainCompute(); } - // queue up a deferred allocation - params.scratchForDeviceASBuild->multi_deallocate(AllocCount,&offsets[0],&sizes[0],params.compute->getFutureScratchSemaphore()); } // stream the instance/geometry input in const size_t trackedBLASesOffset = trackedBLASes.size(); @@ -4983,6 +5013,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul buildInfo.trackedBLASes = {reinterpret_cast(trackedBLASesOffset),trackedBLASes.size()-trackedBLASesOffset}; // no special extra byte offset into the instance buffer rangeInfos.emplace_back(instanceCount,0u); + abortAllocation = false; // const bool willCompact = tlasToBuild.second.compact; if (willCompact) From 44f241a977148b2e2b02a04bb7b57b5c6530ac7a Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 14 May 2025 16:54:18 +0200 Subject: [PATCH 114/346] update & make boost submodule URL relative, update NBL_BOOST_GENERATE_DEP_LIST mini tool to autogen boost's .gitmodules --- .gitmodules | 2 +- 3rdparty/boost/CMakeLists.txt | 26 +++++++++++++++++++++++++- 3rdparty/boost/superproject | 2 +- 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/.gitmodules b/.gitmodules index 0aacb58ffd..ba078222e2 100644 --- a/.gitmodules +++ b/.gitmodules @@ -90,7 +90,7 @@ url = git@github.com:Devsh-Graphics-Programming/Nabla-Continous-Integration-Python-Framework.git [submodule "3rdparty/boost/superproject"] path = 3rdparty/boost/superproject - url = git@github.com:boostorg/boost.git + url = ../boost.git [submodule "3rdparty/argparse"] path = 3rdparty/argparse url = git@github.com:p-ranav/argparse.git diff --git a/3rdparty/boost/CMakeLists.txt 
b/3rdparty/boost/CMakeLists.txt index f3460fe8d6..36e596cbf6 100644 --- a/3rdparty/boost/CMakeLists.txt +++ b/3rdparty/boost/CMakeLists.txt @@ -44,7 +44,7 @@ set(NBL_BOOST_TARGETS PARENT_SCOPE) # Boost uses it's own tool for generating dependency list for targets, therefore we -# can make sure manually added dependnecy subdirectories for a library are valid +# can make sure manually added dependency subdirectories for a library are valid # https://www.boost.org/doc/libs/1_83_0/tools/boostdep/doc/html/index.html#boostdep.introduction.building_boostdep if(NBL_BOOST_GENERATE_DEP_LIST) # internal, for Nabla devs @@ -83,5 +83,29 @@ if(NBL_BOOST_GENERATE_DEP_LIST) # internal, for Nabla devs list(FILTER NBL_BOOST_LIBS EXCLUDE REGEX "(unknown)") string(REPLACE "~" "/" NBL_BOOST_LIBS "${NBL_BOOST_LIBS}") + # we override boost's .gitmodules to pick only those modules we really use (reported by boost's dep executable) + # boost hosts now like 200 repositories, some of them are really big however atm we reference around 60 + set(BOOST_SUBMODULE_TEMPLATE +[=[ + +[submodule "@NAME@"] + path = libs/@NAME@ + url = ../@FLATTEN_NAME@.git + fetchRecurseSubmodules = on-demand + branch = . +]=] + ) + + unset(BOOST_GITMODULES) + foreach(NAME ${NBL_BOOST_LIBS}) + string(REPLACE "/" "_" FLATTEN_NAME "${NAME}") + string(CONFIGURE "${BOOST_SUBMODULE_TEMPLATE}" TEMPLATE) + string(APPEND BOOST_GITMODULES "${TEMPLATE}") + endforeach() + + # NOTE: this you commit to version control file(WRITE "${NBL_BOOST_WAVE_DEP_FILE}" "set(NBL_BOOST_LIBS ${NBL_BOOST_LIBS})") + + # and this one too + you update boost submodule pointer with the update! 
+ file(WRITE "${CMAKE_CURRENT_SOURCE_DIR}/superproject/.gitmodules" "${BOOST_GITMODULES}") endif() diff --git a/3rdparty/boost/superproject b/3rdparty/boost/superproject index 1c4d3531e4..e1a703f795 160000 --- a/3rdparty/boost/superproject +++ b/3rdparty/boost/superproject @@ -1 +1 @@ -Subproject commit 1c4d3531e416a1f72b0e6a5e0f7173f93cf97e92 +Subproject commit e1a703f7956264e463329d49ab05100bdc34e219 From 340cb7511ae24f32ab53e57f79be8350ecede68f Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 14 May 2025 17:07:25 +0200 Subject: [PATCH 115/346] the implementation of `IGPUCommandBuffer::empty()` was completely and utterly broken --- include/nbl/video/IGPUCommandBuffer.h | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/include/nbl/video/IGPUCommandBuffer.h b/include/nbl/video/IGPUCommandBuffer.h index cfe0439cde..2584707ab6 100644 --- a/include/nbl/video/IGPUCommandBuffer.h +++ b/include/nbl/video/IGPUCommandBuffer.h @@ -93,7 +93,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject case STATE::EXECUTABLE: [[fallthrough]]; case STATE::PENDING: - if (m_noCommands) + if (!m_noCommands) return false; [[fallthrough]]; default: @@ -261,13 +261,21 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject inline bool buildAccelerationStructures(const std::span infos, const IGPUBottomLevelAccelerationStructure::DirectBuildRangeRangeInfos buildRangeInfos) { if (const auto totalGeometryCount=buildAccelerationStructures_common(infos,buildRangeInfos); totalGeometryCount) - return buildAccelerationStructures_impl(infos,buildRangeInfos,totalGeometryCount); + if (buildAccelerationStructures_impl(infos,buildRangeInfos,totalGeometryCount)) + { + m_noCommands = false; + return true; + } return false; } inline bool buildAccelerationStructures(const std::span infos, const IGPUTopLevelAccelerationStructure::DirectBuildRangeRangeInfos buildRangeInfos) { if (buildAccelerationStructures_common(infos,buildRangeInfos)) - return 
buildAccelerationStructures_impl(infos,buildRangeInfos); + if (buildAccelerationStructures_impl(infos,buildRangeInfos)) + { + m_noCommands = false; + return true; + } return false; } // We don't allow different indirect command addresses due to https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkCmdBuildAccelerationStructuresIndirectKHR-pIndirectDeviceAddresses-03646 @@ -300,10 +308,14 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject if (const auto totalGeometryCount=buildAccelerationStructures_common(infos,maxPrimitiveOrInstanceCounts,indirectRangeBuffer); totalGeometryCount) { + bool success; if constexpr(std::is_same_v) - return buildAccelerationStructuresIndirect_impl(indirectRangeBuffer,infos,pIndirectOffsets,pIndirectStrides,maxPrimitiveOrInstanceCounts,totalGeometryCount); + success = buildAccelerationStructuresIndirect_impl(indirectRangeBuffer,infos,pIndirectOffsets,pIndirectStrides,maxPrimitiveOrInstanceCounts,totalGeometryCount); else - return buildAccelerationStructuresIndirect_impl(indirectRangeBuffer,infos,pIndirectOffsets,pIndirectStrides,maxPrimitiveOrInstanceCounts); + success = buildAccelerationStructuresIndirect_impl(indirectRangeBuffer,infos,pIndirectOffsets,pIndirectStrides,maxPrimitiveOrInstanceCounts); + if (success) + m_noCommands = false; + return success; } return false; } From 6d793d74be629064984a6f063c2c04a73bf8158a Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 14 May 2025 17:45:47 +0200 Subject: [PATCH 116/346] remind self to never reinterpret_cast between virtual base and derived --- include/nbl/video/IGPUCommandBuffer.h | 2 +- include/nbl/video/ILogicalDevice.h | 4 ++-- src/nbl/video/IGPUCommandBuffer.cpp | 2 +- src/nbl/video/IQueue.cpp | 6 +++++- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/include/nbl/video/IGPUCommandBuffer.h b/include/nbl/video/IGPUCommandBuffer.h index 2584707ab6..d5a3fac0af 100644 --- a/include/nbl/video/IGPUCommandBuffer.h +++ 
b/include/nbl/video/IGPUCommandBuffer.h @@ -884,7 +884,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject // If the user wants the builds to be tracking, and make the TLAS remember the BLASes that have been built into it. // NOTE: We know that a TLAS may be rebuilt multiple times per frame on purpose and not only the final BLASes need to be kept alive till submission finishes. // However, the Command Pool already tracks resources referenced in the Build Infos, so we only need pointers into those records. - core::unordered_map> m_TLASToBLASReferenceSets; + core::unordered_map>> m_TLASToBLASReferenceSets; const IGPUGraphicsPipeline* m_boundGraphicsPipeline; const IGPUComputePipeline* m_boundComputePipeline; diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 93aa965416..8ad3b839ab 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -580,7 +580,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe } // the rawpointers are already smartpointers in whatever else the `fillTracking` declared above writes - core::unordered_map> m_TLASToBLASReferenceSets; + core::unordered_map>> m_TLASToBLASReferenceSets; } callback = {}; auto& tracking = deferredOperation->m_resourceTracking; @@ -593,7 +593,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe { const auto blasCount = info.trackedBLASes.size(); if (blasCount) - callback.m_TLASToBLASReferenceSets[info.dstAS] = {reinterpret_cast(oit-blasCount),blasCount}; + callback.m_TLASToBLASReferenceSets[info.dstAS] = {oit-blasCount,blasCount}; else callback.m_TLASToBLASReferenceSets[info.dstAS] = {}; } diff --git a/src/nbl/video/IGPUCommandBuffer.cpp b/src/nbl/video/IGPUCommandBuffer.cpp index fcf55b74c1..6bde593097 100644 --- a/src/nbl/video/IGPUCommandBuffer.cpp +++ b/src/nbl/video/IGPUCommandBuffer.cpp @@ -843,7 +843,7 @@ uint32_t 
IGPUCommandBuffer::buildAccelerationStructures_common(const std::span(oit-blasCount),blasCount}; + m_TLASToBLASReferenceSets[info.dstAS] = {oit-blasCount,blasCount}; else m_TLASToBLASReferenceSets[info.dstAS] = {}; } diff --git a/src/nbl/video/IQueue.cpp b/src/nbl/video/IQueue.cpp index e761b7a733..e7612cc8d1 100644 --- a/src/nbl/video/IQueue.cpp +++ b/src/nbl/video/IQueue.cpp @@ -157,7 +157,11 @@ IQueue::DeferredSubmitCallback::DeferredSubmitCallback(const SSubmitInfo& info) { const auto tlas = refSet.first; // in theory could assert no duplicate entries, but thats obvious - m_TLASToBLASReferenceSets[tlas] = { .m_BLASes = {refSet.second.begin(),refSet.second.end()}, .m_buildVer = tlas->registerNextBuildVer()}; + auto& out = m_TLASToBLASReferenceSets[tlas]; + out.m_BLASes.reserve(refSet.second.size()); + for (const auto& refCtd : refSet.second) + out.m_BLASes.emplace(dynamic_cast(refCtd.get())); + out.m_buildVer = tlas->registerNextBuildVer(); } } // We don't hold the last signal semaphore, because the timeline does as an Event trigger. 
From 507904f462c9fe50928b198ca2aabd7fa5c8b460 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 15 May 2025 10:38:03 +0700 Subject: [PATCH 117/346] minor fixes --- examples_tests | 2 +- include/nbl/builtin/hlsl/subgroup2/ballot.hlsl | 9 +++++---- .../{config.hlsl => arithmetic_config.hlsl} | 8 ++++---- include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl | 12 ++++++------ 4 files changed, 16 insertions(+), 15 deletions(-) rename include/nbl/builtin/hlsl/workgroup2/{config.hlsl => arithmetic_config.hlsl} (95%) diff --git a/examples_tests b/examples_tests index a42a742f36..908abd110c 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit a42a742f363bda827991794053fb93fd803023f1 +Subproject commit 908abd110c387d48110ce8aeb67f0e0f2dd68943 diff --git a/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl b/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl index 6c7ec4f593..52ae6de2d9 100644 --- a/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl +++ b/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl @@ -11,12 +11,13 @@ namespace hlsl namespace subgroup2 { +template uint32_t LastSubgroupInvocation() { - // why this code was wrong before: - // - only compute can use SubgroupID - // - but there's no mapping of InvocationID to SubgroupID and Index - return glsl::subgroupBallotFindMSB(glsl::subgroupBallot(true)); + if (AssumeAllActive) + return glsl::gl_SubgroupSize()-1; + else + return glsl::subgroupBallotFindMSB(glsl::subgroupBallot(true)); } bool ElectLast() diff --git a/include/nbl/builtin/hlsl/workgroup2/config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl similarity index 95% rename from include/nbl/builtin/hlsl/workgroup2/config.hlsl rename to include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 7855cc1701..2f24c863da 100644 --- a/include/nbl/builtin/hlsl/workgroup2/config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -1,8 +1,8 @@ // Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. 
// This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _NBL_BUILTIN_HLSL_WORKGROUP2_CONFIG_INCLUDED_ -#define _NBL_BUILTIN_HLSL_WORKGROUP2_CONFIG_INCLUDED_ +#ifndef _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_CONFIG_INCLUDED_ +#define _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_CONFIG_INCLUDED_ #include "nbl/builtin/hlsl/cpp_compat.hlsl" @@ -33,7 +33,7 @@ struct items_per_invocation } template -struct Configuration +struct ArithmeticConfiguration { NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSizeLog2 = _WorkgroupSizeLog2; NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << WorkgroupSizeLog2; @@ -61,7 +61,7 @@ struct Configuration // special case when workgroup size 2048 and subgroup size 16 needs 3 levels and virtual workgroup size 4096 to get a full subgroup scan each on level 1 and 2 16x16x16=4096 // specializing with macros because of DXC bug: https://github.com/microsoft/DirectXShaderCom0piler/issues/7007 #define SPECIALIZE_CONFIG_CASE_2048_16(ITEMS_PER_INVOC) template<>\ -struct Configuration<11, 4, ITEMS_PER_INVOC>\ +struct ArithmeticConfiguration<11, 4, ITEMS_PER_INVOC>\ {\ NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << 11u;\ NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = uint16_t(4u);\ diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index b03120b5f6..681ba39911 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -103,7 +103,7 @@ struct reduce { dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); scan_local[idx] = reduction0(scan_local[idx]); - if (subgroup2::ElectLast()) + if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> 
Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); @@ -160,7 +160,7 @@ struct scan { dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); scan_local[idx] = inclusiveScan0(scan_local[idx]); - if (subgroup2::ElectLast()) + if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); @@ -236,7 +236,7 @@ struct reduce { dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); scan_local[idx] = reduction0(scan_local[idx]); - if (subgroup2::ElectLast()) + if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); @@ -254,7 +254,7 @@ struct reduce for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); lv1_val = reduction1(lv1_val); - if (subgroup2::ElectLast()) + if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t bankedIndex = (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (invocationIndex/Config::ItemsPerInvocation_2); scratchAccessor.set(bankedIndex, 
lv1_val[Config::ItemsPerInvocation_1-1]); @@ -312,7 +312,7 @@ struct scan { dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); scan_local[idx] = inclusiveScan0(scan_local[idx]); - if (subgroup2::ElectLast()) + if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); @@ -331,7 +331,7 @@ struct scan for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); lv1_val = inclusiveScan1(lv1_val); - if (subgroup2::ElectLast()) + if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t bankedIndex = (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); scratchAccessor.set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); From 59fcc93d2f0c0ac1b2196426e34d9ed8d9586a13 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 15 May 2025 12:45:57 +0700 Subject: [PATCH 118/346] Implement all computeDependants for IAssets --- include/nbl/asset/ICPUAccelerationStructure.h | 21 +++---- include/nbl/asset/ICPUAnimationLibrary.h | 16 +----- include/nbl/asset/ICPUBuffer.h | 10 ++-- include/nbl/asset/ICPUBufferView.h | 9 ++- include/nbl/asset/ICPUComputePipeline.h | 9 --- include/nbl/asset/ICPUDescriptorSet.h | 5 +- include/nbl/asset/ICPUDescriptorSetLayout.h | 15 +++-- include/nbl/asset/ICPUImage.h | 7 ++- include/nbl/asset/ICPUImageView.h | 10 ++-- include/nbl/asset/ICPUMesh.h | 6 +- include/nbl/asset/ICPUMeshBuffer.h | 9 +-- include/nbl/asset/ICPUPipelineCache.h | 7 ++- 
include/nbl/asset/ICPUPipelineLayout.h | 22 +++----- include/nbl/asset/ICPURayTracingPipeline.h | 14 ----- include/nbl/asset/ICPURenderpass.h | 6 +- .../asset/ICPURenderpassIndependentPipeline.h | 7 ++- include/nbl/asset/ICPUSampler.h | 7 ++- include/nbl/asset/ICPUSkeleton.h | 9 +-- include/nbl/asset/IShader.h | 8 +-- src/nbl/asset/ICPUDescriptorSet.cpp | 55 +++++++++---------- 20 files changed, 107 insertions(+), 145 deletions(-) diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index 9c9af32f7b..affd165667 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -136,7 +136,10 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo } // Do not report anything as a dependant, we'll simply drop the data instead of discarding its contents - inline size_t getDependantCount() const override {return 0;} + inline core::unordered_set computeDependants() const override + { + return {}; + } inline core::blake3_hash_t computeContentHash() const override { @@ -236,8 +239,6 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo protected: virtual ~ICPUBottomLevelAccelerationStructure() = default; - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} - inline void discardContent_impl() override { m_triangleGeoms = nullptr; @@ -263,8 +264,13 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA // ICPUTopLevelAccelerationStructure() = default; - // - inline size_t getDependantCount() const override {return m_instances->size();} + inline core::unordered_set computeDependants() const override + { + core::unordered_set dependants; + for (const auto& instance : m_instances) + dependants.insert(instance.getBase().blas.get()); + return dependants; + } // inline auto& getBuildRangeInfo() @@ -360,11 +366,6 @@ class ICPUTopLevelAccelerationStructure final : public 
IAsset, public ITopLevelA protected: virtual ~ICPUTopLevelAccelerationStructure() = default; - inline IAsset* getDependant_impl(const size_t ix) override - { - return m_instances->operator[](ix).getBase().blas.get(); - } - private: core::smart_refctd_dynamic_array m_instances = nullptr; hlsl::acceleration_structures::top_level::BuildRangeInfo m_buildRangeInfo; diff --git a/include/nbl/asset/ICPUAnimationLibrary.h b/include/nbl/asset/ICPUAnimationLibrary.h index 1b02787597..5fea370b63 100644 --- a/include/nbl/asset/ICPUAnimationLibrary.h +++ b/include/nbl/asset/ICPUAnimationLibrary.h @@ -96,21 +96,9 @@ class ICPUAnimationLibrary final : public IAnimationLibrary, public constexpr static inline auto AssetType = ET_ANIMATION_LIBRARY; inline E_TYPE getAssetType() const override { return AssetType; } - inline size_t getDependantCount() const override {return 3;} - - protected: - inline IAsset* getDependant_impl(const size_t ix) override + inline core::unordered_set computeDependants() const override { - switch (ix) - { - case 0: - return m_keyframeStorageBinding.buffer.get(); - case 1: - return m_timestampStorageBinding.buffer.get(); - default: - break; - } - return m_animationStorageRange.buffer.get(); + return { m_keyframeStorageBinding.buffer.get(), m_timestampStorageBinding.buffer.get(), m_animationStorageRange.buffer.get() }; } }; diff --git a/include/nbl/asset/ICPUBuffer.h b/include/nbl/asset/ICPUBuffer.h index 5bb16bd0ac..2d495ef02e 100644 --- a/include/nbl/asset/ICPUBuffer.h +++ b/include/nbl/asset/ICPUBuffer.h @@ -75,7 +75,10 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed constexpr static inline auto AssetType = ET_BUFFER; inline IAsset::E_TYPE getAssetType() const override final { return AssetType; } - inline size_t getDependantCount() const override { return 0; } + inline core::unordered_set computeDependants() const override + { + return {}; + } inline core::blake3_hash_t computeContentHash() const override { @@ -113,11 +116,6 @@ class 
ICPUBuffer final : public asset::IBuffer, public IPreHashed } protected: - inline IAsset* getDependant_impl(const size_t ix) override - { - return nullptr; - } - inline void discardContent_impl() override { if (m_data) diff --git a/include/nbl/asset/ICPUBufferView.h b/include/nbl/asset/ICPUBufferView.h index 3819136c98..7f3f676695 100644 --- a/include/nbl/asset/ICPUBufferView.h +++ b/include/nbl/asset/ICPUBufferView.h @@ -28,7 +28,10 @@ class ICPUBufferView : public IBufferView, public IAsset constexpr static inline auto AssetType = ET_BUFFER_VIEW; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } - inline size_t getDependantCount() const override {return 1;} + inline core::unordered_set computeDependants() const override + { + return { m_buffer.get() }; + } ICPUBuffer* getUnderlyingBuffer() { @@ -51,10 +54,6 @@ class ICPUBufferView : public IBufferView, public IAsset protected: virtual ~ICPUBufferView() = default; - inline IAsset* getDependant_impl(const size_t ix) override - { - return m_buffer.get(); - } }; } diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 01859e0c3f..8d8b343a3d 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -36,8 +36,6 @@ class ICPUComputePipeline final : public ICPUPipeline computeDependants() const override { return {m_layout.get(), m_specInfo.shader.get()}; @@ -62,13 +60,6 @@ class ICPUComputePipeline final : public ICPUPipeline(m_layout.get()); - } - private: SShaderSpecInfo m_specInfo; diff --git a/include/nbl/asset/ICPUDescriptorSet.h b/include/nbl/asset/ICPUDescriptorSet.h index 826c54cc39..77640b8f9f 100644 --- a/include/nbl/asset/ICPUDescriptorSet.h +++ b/include/nbl/asset/ICPUDescriptorSet.h @@ -47,8 +47,6 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSetgetTotalBindingCount()+1;} - // inline ICPUDescriptorSetLayout* getLayout() { @@ -79,10 +77,11 @@ class NBL_API2 ICPUDescriptorSet final 
: public IDescriptorSet clone(uint32_t _depth = ~0u) const override; + core::unordered_set computeDependants() const override; + protected: virtual ~ICPUDescriptorSet() = default; - IAsset* getDependant_impl(size_t ix) override; private: diff --git a/include/nbl/asset/ICPUDescriptorSetLayout.h b/include/nbl/asset/ICPUDescriptorSetLayout.h index 8f45a789ea..2ddf1e26be 100644 --- a/include/nbl/asset/ICPUDescriptorSetLayout.h +++ b/include/nbl/asset/ICPUDescriptorSetLayout.h @@ -57,15 +57,20 @@ class ICPUDescriptorSetLayout : public IDescriptorSetLayout, public constexpr static inline auto AssetType = ET_DESCRIPTOR_SET_LAYOUT; inline E_TYPE getAssetType() const override { return AssetType; } - inline size_t getDependantCount() const override {return m_immutableSamplers ? m_immutableSamplers->size():0;} + core::unordered_set computeDependants() const override + { + if (!m_immutableSamplers) return {}; + core::unordered_set dependants; + for (const auto& sampler: m_immutableSamplers) + { + dependants.insert(sampler.get()); + } + return dependants; + } protected: virtual ~ICPUDescriptorSetLayout() = default; - inline IAsset* getDependant_impl(const size_t ix) override - { - return m_immutableSamplers->operator[](ix).get(); - } }; } diff --git a/include/nbl/asset/ICPUImage.h b/include/nbl/asset/ICPUImage.h index c27cd21b86..2527fd1ecb 100644 --- a/include/nbl/asset/ICPUImage.h +++ b/include/nbl/asset/ICPUImage.h @@ -46,7 +46,10 @@ class NBL_API2 ICPUImage final : public IImage, public IPreHashed inline IAsset::E_TYPE getAssetType() const override { return AssetType; } // Do not report buffer as dependant, as we will simply drop it instead of discarding its contents! 
- inline size_t getDependantCount() const override {return 0;} + inline core::unordered_set computeDependants() const override + { + return {}; + } core::blake3_hash_t computeContentHash() const override; @@ -202,8 +205,6 @@ class NBL_API2 ICPUImage final : public IImage, public IPreHashed inline ICPUImage(const SCreationParams& _params) : IImage(_params) {} virtual ~ICPUImage() = default; - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} - inline void discardContent_impl() override { buffer = nullptr; diff --git a/include/nbl/asset/ICPUImageView.h b/include/nbl/asset/ICPUImageView.h index 87df463021..6b3d562a60 100644 --- a/include/nbl/asset/ICPUImageView.h +++ b/include/nbl/asset/ICPUImageView.h @@ -49,8 +49,10 @@ class ICPUImageView final : public IImageView, public IAsset constexpr static inline auto AssetType = ET_IMAGE_VIEW; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } - //! - inline size_t getDependantCount() const override {return 1;} + inline core::unordered_set computeDependants() const override + { + return { params.image.get() }; + } //! const SComponentMapping& getComponents() const { return params.components; } @@ -68,10 +70,6 @@ class ICPUImageView final : public IImageView, public IAsset protected: virtual ~ICPUImageView() = default; - inline IAsset* getDependant_impl(const size_t ix) override - { - return params.image.get(); - } }; } diff --git a/include/nbl/asset/ICPUMesh.h b/include/nbl/asset/ICPUMesh.h index a21f5f3f02..2648900ccc 100644 --- a/include/nbl/asset/ICPUMesh.h +++ b/include/nbl/asset/ICPUMesh.h @@ -82,10 +82,12 @@ class ICPUMesh final : public IMesh, public IAsset } //! 
CLASS IS DEPRECATED ANYWAY - inline size_t getDependantCount() const override {return 0;} + inline core::unordered_set computeDependants() const override + { + return {}; + } protected: - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} private: core::vector> m_meshBuffers; diff --git a/include/nbl/asset/ICPUMeshBuffer.h b/include/nbl/asset/ICPUMeshBuffer.h index 532b622090..61e9168a98 100644 --- a/include/nbl/asset/ICPUMeshBuffer.h +++ b/include/nbl/asset/ICPUMeshBuffer.h @@ -611,11 +611,12 @@ class ICPUMeshBuffer final : public IMeshBuffer(const_cast(this)->getJointAABBs()); } - //! CLASS IS DEPRECATED ANYWAY - inline size_t getDependantCount() const override {return 0;} + //! Class is deprecated anyway. + inline core::unordered_set computeDependants() const override + { + return {}; + } - protected: - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} }; } diff --git a/include/nbl/asset/ICPUPipelineCache.h b/include/nbl/asset/ICPUPipelineCache.h index 0c1d8c17cf..6fc019ce7f 100644 --- a/include/nbl/asset/ICPUPipelineCache.h +++ b/include/nbl/asset/ICPUPipelineCache.h @@ -60,7 +60,10 @@ class ICPUPipelineCache final : public IPreHashed return core::make_smart_refctd_ptr(std::move(cache_cp)); } - inline size_t getDependantCount() const override {return 0;} + inline core::unordered_set computeDependants() const override + { + return {}; + } // inline core::blake3_hash_t computeContentHash() const override @@ -86,8 +89,6 @@ class ICPUPipelineCache final : public IPreHashed const auto& getEntries() const {return m_cache;} protected: - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} - inline void discardContent_impl() override { for (auto& entry : m_cache) diff --git a/include/nbl/asset/ICPUPipelineLayout.h b/include/nbl/asset/ICPUPipelineLayout.h index c4a76fdea9..994d480b17 100644 --- a/include/nbl/asset/ICPUPipelineLayout.h +++ b/include/nbl/asset/ICPUPipelineLayout.h @@ -30,14 
+30,14 @@ class ICPUPipelineLayout : public IAsset, public IPipelineLayout&& _layout2, core::smart_refctd_ptr&& _layout3 ) : IPipelineLayout(_pcRanges,std::move(_layout0),std::move(_layout1),std::move(_layout2),std::move(_layout3)) {} - // - inline size_t getDependantCount() const override + inline core::unordered_set computeDependants() const override { - size_t count = 0; - for (auto i=0; i dependants; + for (auto i = 0; i < m_descSetLayouts.size(); i++) + { + if (m_descSetLayouts[i]) continue; + dependants.insert(m_descSetLayouts[i].get()); + } } // @@ -79,14 +79,6 @@ class ICPUPipelineLayout : public IAsset, public IPipelineLayout computeDependants() const override final { core::unordered_set dependants; dependants.insert(m_raygen.shader.get()); @@ -103,14 +97,6 @@ class ICPURayTracingPipeline final : public ICPUPipeline computeDependants() const override + { + return {}; + } protected: inline ICPURenderpass(const SCreationParams& _params, const SCreationParamValidationResult& _validation) : IRenderpass(_params, _validation) {} inline ~ICPURenderpass() = default; - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} }; } diff --git a/include/nbl/asset/ICPURenderpassIndependentPipeline.h b/include/nbl/asset/ICPURenderpassIndependentPipeline.h index ed0171d11f..8638a4965b 100644 --- a/include/nbl/asset/ICPURenderpassIndependentPipeline.h +++ b/include/nbl/asset/ICPURenderpassIndependentPipeline.h @@ -66,7 +66,10 @@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, _NBL_STATIC_INLINE_CONSTEXPR auto AssetType = ET_RENDERPASS_INDEPENDENT_PIPELINE; inline E_TYPE getAssetType() const override { return AssetType; } - inline size_t getDependantCount() const override {return 0;} + inline core::unordered_set computeDependants() const override + { + return {}; + } // inline const SCachedCreationParams& getCachedCreationParams() const {return IRenderpassIndependentPipeline::getCachedCreationParams();} @@ -137,8 +140,6 
@@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, : IRenderpassIndependentPipeline(params), m_layout(std::move(_layout)) {} virtual ~ICPURenderpassIndependentPipeline() = default; - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} - core::smart_refctd_ptr m_layout; #if 0 std::array,GRAPHICS_SHADER_STAGE_COUNT> m_shaders = {}; diff --git a/include/nbl/asset/ICPUSampler.h b/include/nbl/asset/ICPUSampler.h index 27a918afaa..46cac56ee0 100644 --- a/include/nbl/asset/ICPUSampler.h +++ b/include/nbl/asset/ICPUSampler.h @@ -17,8 +17,6 @@ class ICPUSampler : public ISampler, public IAsset protected: virtual ~ICPUSampler() = default; - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} - public: ICPUSampler(const SParams& _params) : ISampler(_params), IAsset() {} @@ -71,7 +69,10 @@ class ICPUSampler : public ISampler, public IAsset constexpr static inline auto AssetType = ET_SAMPLER; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } - inline size_t getDependantCount() const override {return 0;} + inline core::unordered_set computeDependants() const override + { + return {}; + } }; } diff --git a/include/nbl/asset/ICPUSkeleton.h b/include/nbl/asset/ICPUSkeleton.h index 6f1c576ed8..ce03a9be54 100644 --- a/include/nbl/asset/ICPUSkeleton.h +++ b/include/nbl/asset/ICPUSkeleton.h @@ -79,14 +79,11 @@ class ICPUSkeleton final : public ISkeleton, public IAsset constexpr static inline auto AssetType = ET_SKELETON; inline E_TYPE getAssetType() const override { return AssetType; } - //! - inline size_t getDependantCount() const override {return 2;} - - protected: - inline IAsset* getDependant_impl(const size_t ix) override + inline core::unordered_set computeDependants() const override { - return (ix!=0 ? 
m_defaultTransforms:m_parentJointIDs).buffer.get(); + return { m_defaultTransforms.buffer.get(), m_parentJointIDs.buffer.get() }; } + }; } diff --git a/include/nbl/asset/IShader.h b/include/nbl/asset/IShader.h index a6dab09b54..5abd7d1980 100644 --- a/include/nbl/asset/IShader.h +++ b/include/nbl/asset/IShader.h @@ -50,8 +50,10 @@ class IShader : public IAsset constexpr static inline auto AssetType = ET_SHADER; inline E_TYPE getAssetType() const override { return AssetType; } - // - inline size_t getDependantCount() const override { return 1; } + inline core::unordered_set computeDependants() const override + { + return { m_code.get() }; + } // inline core::smart_refctd_ptr clone(uint32_t _depth=~0u) const override @@ -96,8 +98,6 @@ class IShader : public IAsset protected: virtual ~IShader() = default; - inline IAsset* getDependant_impl(const size_t ix) override {return m_code.get();} - std::string m_filepathHint; core::smart_refctd_ptr m_code; E_CONTENT_TYPE m_contentType; diff --git a/src/nbl/asset/ICPUDescriptorSet.cpp b/src/nbl/asset/ICPUDescriptorSet.cpp index 03724be1a2..a298fea491 100644 --- a/src/nbl/asset/ICPUDescriptorSet.cpp +++ b/src/nbl/asset/ICPUDescriptorSet.cpp @@ -108,36 +108,35 @@ core::smart_refctd_ptr ICPUDescriptorSet::clone(uint32_t _depth) const return cp; } -IAsset* ICPUDescriptorSet::getDependant_impl(size_t ix) +core::unordered_set ICPUDescriptorSet::computeDependants() const { - for (auto i=0u; i(IDescriptor::E_TYPE::ET_COUNT); i++) - if (m_descriptorInfos[i]) + core::unordered_set dependants = { m_layout.get() }; + for (auto i = 0u; i < static_cast(IDescriptor::E_TYPE::ET_COUNT); i++) { - const auto size = m_descriptorInfos[i]->size(); - if (ixoperator[](ix).desc.get(); - if (desc) - switch (IDescriptor::GetTypeCategory(static_cast(i))) - { - case IDescriptor::EC_BUFFER: - return static_cast(desc); - case IDescriptor::EC_SAMPLER: - return static_cast(desc); - case IDescriptor::EC_IMAGE: - return static_cast(desc); - case 
IDescriptor::EC_BUFFER_VIEW: - return static_cast(desc); - case IDescriptor::EC_ACCELERATION_STRUCTURE: - return static_cast(desc); - default: - break; - } - return nullptr; - } - else - ix -= size; + if (!m_descriptorInfos[i]) continue; + const auto size = m_descriptorInfos[i]->size(); + for (auto desc_i = 0u; desc_i < size; desc_i++) + { + auto* desc = m_descriptorInfos[i]->operator[](desc_i).desc.get(); + if (!desc) continue; + switch (IDescriptor::GetTypeCategory(static_cast(i))) + { + case IDescriptor::EC_BUFFER: + dependants.insert(static_cast(desc)); + case IDescriptor::EC_SAMPLER: + dependants.insert(static_cast(desc)); + case IDescriptor::EC_IMAGE: + dependants.insert(static_cast(desc)); + case IDescriptor::EC_BUFFER_VIEW: + dependants.insert(static_cast(desc)); + case IDescriptor::EC_ACCELERATION_STRUCTURE: + dependants.insert(static_cast(desc)); + default: + break; + } + } } - return nullptr; + return dependants; } + } \ No newline at end of file From 542592f7c5926f601351bb1872d65e171b742440 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 15 May 2025 14:44:10 +0700 Subject: [PATCH 119/346] soome changes to arithmetic config --- examples_tests | 2 +- .../hlsl/workgroup2/arithmetic_config.hlsl | 46 +++++++++---------- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 2 +- 3 files changed, 23 insertions(+), 27 deletions(-) diff --git a/examples_tests b/examples_tests index 908abd110c..81238adaec 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 908abd110c387d48110ce8aeb67f0e0f2dd68943 +Subproject commit 81238adaecbd8d717bdab0dd73e08e2938a794c6 diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 2f24c863da..d0800d6996 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -18,6 +18,8 @@ namespace impl template struct virtual_wg_size_log2 { + 
static_assert(WorkgroupSizeLog2>=SubgroupSizeLog2, "WorkgroupSize cannot be smaller than SubgroupSize"); + static_assert(WorkgroupSizeLog2<=SubgroupSizeLog2+4, "WorkgroupSize cannot be larger than SubgroupSize*16"); NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2),uint16_t,conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2*2+2),uint16_t,3,2>::value,1>::value; NBL_CONSTEXPR_STATIC_INLINE uint16_t value = mpl::max_v+SubgroupSizeLog2; }; @@ -30,6 +32,24 @@ struct items_per_invocation NBL_CONSTEXPR_STATIC_INLINE uint16_t value1 = uint16_t(0x1u) << conditional_value, ItemsPerInvocationProductLog2>::value; NBL_CONSTEXPR_STATIC_INLINE uint16_t value2 = uint16_t(0x1u) << mpl::max_v; }; + +// explicit specializations for cases that don't fit +#define SPECIALIZE_VIRTUAL_WG_SIZE_CASE(WGLOG2, SGLOG2, LEVELS, VALUE) template<>\ +struct virtual_wg_size_log2\ +{\ + NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = LEVELS;\ + NBL_CONSTEXPR_STATIC_INLINE uint16_t value = VALUE;\ +};\ + +SPECIALIZE_VIRTUAL_WG_SIZE_CASE(11,4,3,12); +SPECIALIZE_VIRTUAL_WG_SIZE_CASE(7,7,1,7); +SPECIALIZE_VIRTUAL_WG_SIZE_CASE(6,6,1,6); +SPECIALIZE_VIRTUAL_WG_SIZE_CASE(5,5,1,5); +SPECIALIZE_VIRTUAL_WG_SIZE_CASE(4,4,1,4); +SPECIALIZE_VIRTUAL_WG_SIZE_CASE(3,3,1,3); +SPECIALIZE_VIRTUAL_WG_SIZE_CASE(2,2,1,2); + +#undef SPECIALIZE_VIRTUAL_WG_SIZE_CASE } template @@ -39,7 +59,6 @@ struct ArithmeticConfiguration NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << WorkgroupSizeLog2; NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = _SubgroupSizeLog2; NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; - static_assert(WorkgroupSizeLog2>=_SubgroupSizeLog2, "WorkgroupSize cannot be smaller than SubgroupSize"); // must have at least enough level 0 outputs to feed a single subgroup NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupsPerVirtualWorkgroupLog2 = mpl::max_v; @@ -55,34 +74,11 @@ struct 
ArithmeticConfiguration NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = items_per_invoc_t::value2; static_assert(ItemsPerInvocation_1<=4, "3 level scan would have been needed with this config!"); - NBL_CONSTEXPR_STATIC_INLINE uint16_t SharedMemSize = conditional_value::value + SubgroupsPerVirtualWorkgroup*ItemsPerInvocation_1; + NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementCount = conditional_value::value + SubgroupSize*ItemsPerInvocation_1>::value; }; -// special case when workgroup size 2048 and subgroup size 16 needs 3 levels and virtual workgroup size 4096 to get a full subgroup scan each on level 1 and 2 16x16x16=4096 -// specializing with macros because of DXC bug: https://github.com/microsoft/DirectXShaderCom0piler/issues/7007 -#define SPECIALIZE_CONFIG_CASE_2048_16(ITEMS_PER_INVOC) template<>\ -struct ArithmeticConfiguration<11, 4, ITEMS_PER_INVOC>\ -{\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << 11u;\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = uint16_t(4u);\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2;\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupsPerVirtualWorkgroupLog2 = 7u;\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupsPerVirtualWorkgroup = 128u;\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = 3u;\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualWorkgroupSize = uint16_t(0x1u) << 4096;\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_0 = ITEMS_PER_INVOC;\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_1 = 1u;\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = 1u;\ -};\ - -SPECIALIZE_CONFIG_CASE_2048_16(1) -SPECIALIZE_CONFIG_CASE_2048_16(2) -SPECIALIZE_CONFIG_CASE_2048_16(4) - } } } -#undef SPECIALIZE_CONFIG_CASE_2048_16 - #endif diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 681ba39911..461b685c99 100644 --- 
a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -9,7 +9,7 @@ #include "nbl/builtin/hlsl/subgroup2/ballot.hlsl" #include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" #include "nbl/builtin/hlsl/mpl.hlsl" -#include "nbl/builtin/hlsl/workgroup2/config.hlsl" +#include "nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl" namespace nbl { From a9930a025b4b252c1a08c4abc59cd1652cb666ac Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 15 May 2025 16:00:34 +0700 Subject: [PATCH 120/346] removed referencing workgroupID in scans --- examples_tests | 2 +- .../hlsl/workgroup2/arithmetic_config.hlsl | 10 ++++++++ .../builtin/hlsl/workgroup2/shared_scan.hlsl | 24 +++++++++---------- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/examples_tests b/examples_tests index 81238adaec..1de31ddfd7 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 81238adaecbd8d717bdab0dd73e08e2938a794c6 +Subproject commit 1de31ddfd725009bd650f1fe80f1c4a8c2e6a14a diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index d0800d6996..88ff328e05 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -77,6 +77,16 @@ struct ArithmeticConfiguration NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementCount = conditional_value::value + SubgroupSize*ItemsPerInvocation_1>::value; }; +template +struct is_configuration : bool_constant {}; + +template +struct is_configuration > : bool_constant {}; + +template +NBL_CONSTEXPR bool is_configuration_v = is_configuration::value; + + } } } diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 461b685c99..1043decd73 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ 
b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -43,9 +43,9 @@ struct reduce subgroup2::reduction reduction; vector_t value; - dataAccessor.get(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); + dataAccessor.get(workgroup::SubgroupContiguousIndex(), value); value = reduction(value); - dataAccessor.set(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); // can be safely merged with top line? + dataAccessor.set(workgroup::SubgroupContiguousIndex(), value); } }; @@ -63,7 +63,7 @@ struct scan using params_t = subgroup2::ArithmeticParams; vector_t value; - dataAccessor.get(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); + dataAccessor.get(workgroup::SubgroupContiguousIndex(), value); if (Exclusive) { subgroup2::exclusive_scan excl_scan; @@ -74,7 +74,7 @@ struct scan subgroup2::inclusive_scan incl_scan; value = incl_scan(value); } - dataAccessor.set(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); // can be safely merged with above lines? + dataAccessor.set(workgroup::SubgroupContiguousIndex(), value); // can be safely merged with above lines? 
} }; @@ -101,7 +101,7 @@ struct reduce [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + dataAccessor.get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); scan_local[idx] = reduction0(scan_local[idx]); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { @@ -131,7 +131,7 @@ struct reduce { scalar_t reduce_val; scratchAccessor.get(glsl::gl_SubgroupInvocationID(),reduce_val); - dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, reduce_val); + dataAccessor.set(idx * Config::WorkgroupSize + virtualInvocationIndex, reduce_val); } } }; @@ -158,7 +158,7 @@ struct scan [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + dataAccessor.get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); scan_local[idx] = inclusiveScan0(scan_local[idx]); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { @@ -204,7 +204,7 @@ struct scan for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) scan_local[idx][i] = binop(left, scan_local[idx][i]); } - dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + dataAccessor.set(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); } } }; @@ -234,7 +234,7 @@ struct reduce [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - 
dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + dataAccessor.get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); scan_local[idx] = reduction0(scan_local[idx]); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { @@ -281,7 +281,7 @@ struct reduce { scalar_t reduce_val; scratchAccessor.get(glsl::gl_SubgroupInvocationID(),reduce_val); - dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, reduce_val); + dataAccessor.set(idx * Config::WorkgroupSize + virtualInvocationIndex, reduce_val); } } }; @@ -310,7 +310,7 @@ struct scan [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + dataAccessor.get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); scan_local[idx] = inclusiveScan0(scan_local[idx]); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { @@ -384,7 +384,7 @@ struct scan for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) scan_local[idx][i] = binop(left, scan_local[idx][i]); } - dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + dataAccessor.set(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); } } }; From 8a2ebe36f3e1ede2dec4658f4e7130fac7886c24 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 15 May 2025 13:51:23 +0200 Subject: [PATCH 121/346] correct strategy for boost's .gitmodules mini tool which will *not* lead to issues with non-existing references, update 3rdparty/boost/superproject submodule --- 3rdparty/boost/CMakeLists.txt | 51 
+++++++++++++++-------------------- 3rdparty/boost/superproject | 2 +- 2 files changed, 22 insertions(+), 31 deletions(-) diff --git a/3rdparty/boost/CMakeLists.txt b/3rdparty/boost/CMakeLists.txt index 36e596cbf6..194ad3c35c 100644 --- a/3rdparty/boost/CMakeLists.txt +++ b/3rdparty/boost/CMakeLists.txt @@ -56,13 +56,11 @@ if(NBL_BOOST_GENERATE_DEP_LIST) # internal, for Nabla devs set(NBL_BOOSTDEP_EXE_FILEPATH "${CMAKE_CURRENT_BINARY_DIR}/superproject/tools/boostdep/bin/${NBL_BOOSTDEP_EXE}") - if(NOT EXISTS "${NBL_BOOSTDEP_EXE_FILEPATH}") - macro(NBL_BOOST_EXECUTE) - execute_process(COMMAND ${ARGV} - WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/superproject" - ) - endmacro() + macro(NBL_BOOST_EXECUTE) + execute_process(COMMAND ${ARGV} WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/superproject") + endmacro() + if(NOT EXISTS "${NBL_BOOSTDEP_EXE_FILEPATH}") NBL_BOOST_EXECUTE(cmd /C bootstrap.bat) NBL_BOOST_EXECUTE(cmd /C b2.exe tools/boostdep/build) NBL_BOOST_EXECUTE("${CMAKE_COMMAND}" -E copy "./dist/bin/${NBL_BOOSTDEP_EXE}" "${NBL_BOOSTDEP_EXE_FILEPATH}") @@ -70,7 +68,7 @@ if(NBL_BOOST_GENERATE_DEP_LIST) # internal, for Nabla devs NBL_BOOST_EXECUTE(git reset --hard) endif() - execute_process(COMMAND "${NBL_BOOSTDEP_EXE_FILEPATH}" --boost-root "${CMAKE_CURRENT_SOURCE_DIR}/superproject" --brief wave + NBL_BOOST_EXECUTE("${NBL_BOOSTDEP_EXE_FILEPATH}" --boost-root "${CMAKE_CURRENT_SOURCE_DIR}/superproject" --brief wave OUTPUT_VARIABLE NBL_OUTPUT_VAR ) @@ -83,29 +81,22 @@ if(NBL_BOOST_GENERATE_DEP_LIST) # internal, for Nabla devs list(FILTER NBL_BOOST_LIBS EXCLUDE REGEX "(unknown)") string(REPLACE "~" "/" NBL_BOOST_LIBS "${NBL_BOOST_LIBS}") - # we override boost's .gitmodules to pick only those modules we really use (reported by boost's dep executable) - # boost hosts now like 200 repositories, some of them are really big however atm we reference around 60 - set(BOOST_SUBMODULE_TEMPLATE -[=[ - -[submodule "@NAME@"] - path = libs/@NAME@ - url = ../@FLATTEN_NAME@.git - 
fetchRecurseSubmodules = on-demand - branch = . -]=] - ) + # NOTE: you commit this file to version control AND boost's .gitmodules *if got changed*, use when updating boost to more recent version + file(WRITE "${NBL_BOOST_WAVE_DEP_FILE}" "set(NBL_BOOST_LIBS ${NBL_BOOST_LIBS})") - unset(BOOST_GITMODULES) - foreach(NAME ${NBL_BOOST_LIBS}) - string(REPLACE "/" "_" FLATTEN_NAME "${NAME}") - string(CONFIGURE "${BOOST_SUBMODULE_TEMPLATE}" TEMPLATE) - string(APPEND BOOST_GITMODULES "${TEMPLATE}") - endforeach() + NBL_BOOST_EXECUTE(git config --file .gitmodules --get-regexp path OUTPUT_VARIABLE NBL_OUTPUT_VARIABLE) - # NOTE: this you commit to version control - file(WRITE "${NBL_BOOST_WAVE_DEP_FILE}" "set(NBL_BOOST_LIBS ${NBL_BOOST_LIBS})") + string(REGEX REPLACE "\n" ";" NBL_SUBMODULE_CONFIG_LIST "${NBL_OUTPUT_VARIABLE}") - # and this one too + you update boost submodule pointer with the update! - file(WRITE "${CMAKE_CURRENT_SOURCE_DIR}/superproject/.gitmodules" "${BOOST_GITMODULES}") -endif() + message(STATUS "Updating boost .gitmodules") + foreach(NBL_SUBMODULE_NAME ${NBL_SUBMODULE_CONFIG_LIST}) + string(REGEX MATCH "submodule\\.(.*)\\.path" NBL_SUBMODULE_NAME "${NBL_SUBMODULE_NAME}") + NBL_BOOST_EXECUTE(git config --file .gitmodules submodule.${CMAKE_MATCH_1}.update none) # fallback, ignore all + endforeach() + + foreach(NAME ${NBL_BOOST_LIBS}) + string(REPLACE "/" "_" SUBMODULE "${NAME}") + message(STATUS "BOOST SUBMODULE = ${SUBMODULE}") + NBL_BOOST_EXECUTE(git config --file .gitmodules submodule.${SUBMODULE}.update checkout) # pick only those reported by the module we use + endforeach() +endif() \ No newline at end of file diff --git a/3rdparty/boost/superproject b/3rdparty/boost/superproject index e1a703f795..dcc3e1ade0 160000 --- a/3rdparty/boost/superproject +++ b/3rdparty/boost/superproject @@ -1 +1 @@ -Subproject commit e1a703f7956264e463329d49ab05100bdc34e219 +Subproject commit dcc3e1ade0ae8e7ea0eadc2d951efb1e53450bff From 
892595c0263ed70e71aaa948cef6fe2370c44ab5 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 15 May 2025 14:36:53 +0200 Subject: [PATCH 122/346] BLAS and TLAS build code reuse and unification --- include/nbl/video/IGPUAccelerationStructure.h | 2 +- src/nbl/video/utilities/CAssetConverter.cpp | 844 ++++++++++-------- 2 files changed, 486 insertions(+), 360 deletions(-) diff --git a/include/nbl/video/IGPUAccelerationStructure.h b/include/nbl/video/IGPUAccelerationStructure.h index 60c6add5fb..af541bdccb 100644 --- a/include/nbl/video/IGPUAccelerationStructure.h +++ b/include/nbl/video/IGPUAccelerationStructure.h @@ -177,7 +177,7 @@ class IGPUBottomLevelAccelerationStructure : public asset::IBottomLevelAccelerat inline bool usesMotion() const override {return m_params.flags.hasFlags(SCreationParams::FLAGS::MOTION_BIT);} // read the comments in the .hlsl file, AABB builds ignore certain fields - using BuildRangeInfo = hlsl::acceleration_structures::bottom_level::BuildRangeInfo; + using BuildRangeInfo = hlsl::acceleration_structures::bottom_level::BuildRangeInfo; // TODO: rename to GeometryRangeInfo, and make `BuildRangeInfo = const GeometryRangeInfo*` using DirectBuildRangeRangeInfos = const BuildRangeInfo* const*; using MaxInputCounts = const uint32_t* const; diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index bc9fac01c0..0167a96a43 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2852,7 +2852,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } else { - const uint32_t* pMaxPrimitiveCounts = as->getGeometryPrimitiveCounts().data(); + const uint32_t* pPrimitiveCounts = as->getGeometryPrimitiveCounts().data(); // the code here is not pretty, but DRY-ing is of this is for later if (buildFlags.hasFlags(ICPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) { @@ -2862,56 +2862,59 @@ auto 
CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult const std::span> cpuGeoms = { reinterpret_cast*>(geoms.data()),geoms.size() }; - sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); + sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pPrimitiveCounts); } else { const std::span> cpuGeoms = { reinterpret_cast*>(geoms.data()),geoms.size() }; - sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); + sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pPrimitiveCounts); } // TODO: check if the strides need to be aligned to 4 bytes for AABBs for (const auto& geom : geoms) - if (const auto aabbCount=*(pMaxPrimitiveCounts++); aabbCount) + if (const auto aabbCount=*(pPrimitiveCounts++); aabbCount) incrementBuildSize(aabbCount*geom.stride,alignof(float)); } else { - core::map allocationsPerStride; const auto geoms = as->getTriangleGeometries(); if (patch.hostBuild) { const std::span> cpuGeoms = { reinterpret_cast*>(geoms.data()),geoms.size() }; - sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); + sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pPrimitiveCounts); } else { const std::span> cpuGeoms = { reinterpret_cast*>(geoms.data()),geoms.size() }; - sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); + sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pPrimitiveCounts); } for (const auto& geom : geoms) - if (const auto triCount=*(pMaxPrimitiveCounts++); triCount) + if (const auto triCount=*(pPrimitiveCounts++); triCount) { + auto size = geom.vertexStride*(geom.vertexData[1] ? 
2:1)*geom.maxVertex; + if (geom.hasTransform()) + size = core::alignUp(size,alignof(float))+sizeof(hlsl::float32_t3x4); + auto alignment = 0u; switch (geom.indexType) { case E_INDEX_TYPE::EIT_16BIT: - allocationsPerStride[sizeof(uint16_t)] += triCount*3; + alignment = alignof(uint16_t); break; case E_INDEX_TYPE::EIT_32BIT: - allocationsPerStride[sizeof(uint32_t)] += triCount*3; + alignment = alignof(uint32_t); break; default: break; } - allocationsPerStride[geom.vertexStride] += (geom.vertexData[1] ? 2:1)*geom.maxVertex; + if (alignment) + size = core::alignUp(size,alignment)+triCount*3*alignment; + incrementBuildSize(size,hlsl::max(alignment,geom.vertexStride)); } - for (const auto& entry : allocationsPerStride) - incrementBuildSize(entry.first*entry.second,entry.first); } } } @@ -4617,226 +4620,169 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul auto& tlasesToBuild = reservations.m_tlasConversions[0]; const auto blasCount = blasesToBuild.size(); const auto tlasCount = tlasesToBuild.size(); - const auto maxASCount = hlsl::max(tlasCount,blasCount); ownershipTransfers.reserve(blasCount+tlasCount); - auto* scratchBuffer = params.scratchForDeviceASBuild->getBuffer(); - core::vector flushRanges; - const bool manualFlush = scratchBuffer->getBoundMemory().memory->haveToMakeVisible(); - if (manualFlush) // BLAS builds do max 3 writes each TLAS builds do max 2 writes each - flushRanges.reserve(hlsl::max(blasCount*3,tlasCount*2)); // Right now we build all BLAS first, then all TLAS // (didn't fancy horrible concurrency managment taking compactions into account) auto queryPool = device->createQueryPool({.queryCount=hlsl::max(blasCount,tlasCount),.queryType=IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE}); - const asset::SMemoryBarrier readGeometryOrInstanceInASBuildBarrier = { - // the last use of the source BLAS could have been a build or a compaction - .srcStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, - .srcAccessMask = 
ACCESS_FLAGS::TRANSFER_WRITE_BIT, - .dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT, - .dstAccessMask = ACCESS_FLAGS::STORAGE_READ_BIT - }; - // lambdas! - auto streamDataToScratch = [&](const size_t offset, const size_t size,IUtilities::IUpstreamingDataProducer& callback) -> bool - { - if (deviceASBuildScratchPtr) - { - callback(deviceASBuildScratchPtr+offset,0ull,size); - if (manualFlush) - flushRanges.emplace_back(scratchBuffer->getBoundMemory().memory,offset,size,ILogicalDevice::MappedMemoryRange::align_non_coherent_tag); - return true; - } - else if (const SBufferRange range={.offset=offset,.size=size,.buffer=smart_refctd_ptr(scratchBuffer)}; params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,callback)) - return true; - else - return false; - }; - // - using scratch_allocator_t = std::remove_reference_t; - using addr_t = typename scratch_allocator_t::size_type; - core::vector scratchOffsets; - scratchOffsets.reserve(maxASCount); - core::vector scratchSizes; - scratchSizes.reserve(maxASCount); - auto recordBuildCommandsBase = [&](auto& buildInfos, auto& rangeInfos)->void - { - if (buildInfos.empty()) - return; - // Lets analyze sync cases: - // - Mapped Host write = no barrier, flush & optional submit sufficient - // - Single Queue = Global Memory Barrier - // - Two distinct Queues = no barrier, semaphore signal-wait is sufficient - // - Two distinct Queue Families Exclusive Sharing mode = QFOT necessary but we require concurrent sharing on the scratch buffer ! 
- bool success = !uniQueue || !deviceASBuildScratchPtr || pipelineBarrier(computeCmdBuf,{.memBarriers={&readGeometryOrInstanceInASBuildBarrier,1}},"Pipeline Barriers of Acceleration Structure backing Buffers failed!"); - // - success = success && computeCmdBuf->cmdbuf->buildAccelerationStructures({buildInfos},rangeInfos.data()); - if (success) - { - submitsNeeded |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; - // queue up a deferred allocation - params.scratchForDeviceASBuild->multi_deallocate(scratchOffsets.size(),scratchOffsets.data(),scratchSizes.data(),params.compute->getFutureScratchSemaphore()); - } - else - { - // release right away - params.scratchForDeviceASBuild->multi_deallocate(scratchOffsets.size(),scratchOffsets.data(),scratchSizes.data()); - for (const auto& info : buildInfos) - { - const auto stagingFound = findInStaging.template operator()(info.dstAS); - smart_refctd_ptr dummy; // already null at this point - markFailure("AS Build Command Recording",&dummy,&stagingFound->second); - } - } - scratchOffsets.clear(); - scratchSizes.clear(); - buildInfos.clear(); - rangeInfos.clear(); - }; - - // Not messing around with listing AS backing buffers individually, ergonomics of that are null - const asset::SMemoryBarrier readASInASCompactBarrier = { - .srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT, - .srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT, - .dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT, - .dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT - }; - - // compacted BLASes need to be substituted in cache and TLAS Build Inputs - using compacted_blas_map_t = core::unordered_map>; + // leftover for TLAS builds + using compacted_blas_map_t = unordered_map>; compacted_blas_map_t compactedBLASMap; - // Device BLAS builds - if (blasCount) + bool failedBLASBarrier = false; + // returns a map of compacted Acceleration Structures + auto buildAndCompactASes = [&](auto& asesToBuild)->unordered_map> 
{ - core::vector compactions; - // build - { - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Build BLASes START"); - computeCmdBuf->cmdbuf->endDebugMarker(); -#ifdef NBL_ACCELERATION_STRUCTURE_CONVERSION - constexpr auto GeometryIsAABBFlag = ICPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; - - core::vector buildInfos; buildInfos.reserve(blasCount); - core::vector rangeInfo; rangeInfo.reserve(blasCount); - core::vector> triangles; - core::vector> aabbs; - { - size_t totalTriGeoCount = 0; - size_t totalAABBGeoCount = 0; - for (auto& item : blasToBuild) - { - const size_t geoCount = item.canonical->getGeometryCount(); - if (item.canonical->getBuildFlags().hasFlags(GeometryIsAABBFlag)) - totalAABBGeoCount += geoCount; - else - totalTriGeoCount += geoCount; - } - triangles.reserve(totalTriGeoCount); - triangles.reserve(totalAABBGeoCount); - } - for (auto& item : blasToBuild) - { - auto* as = item.gpuObj; - auto pFound = &findInStaging.template operator()(as)->second; - if (item.asBuildParams.host) - { - auto dOp = device->createDeferredOperation(); - // - if (!device->buildAccelerationStructure(dOp.get(),info,range)) - { - markFailure("BLAS Build Command Recording",&item.canonical,pFound); - continue; - } - } - else - { - auto& buildInfo = buildInfo.emplace_back({ - .buildFlags = item.buildFlags, - .geometryCount = item.canonical->getGeometryCount(), - // this is not an update - .srcAS = nullptr, - .dstAS = as.get() - }); - if (item.canonical->getBuildFlags().hasFlags(GeometryIsAABBFlag)) - buildInfo.aabbs = nullptr; - else - buildInfo.triangles = nullptr; - computeCmdBuf->cmdbuf->buildAccelerationStructures(buildInfo,rangeInfo); - } - } -#endif - if (!compactions.empty()) - { - // submit cause host needs to read the queries - drainCompute(); - } - // want to launch the BLAS builds in a separate submit, so the scratch semaphore can signal and free the scratch so more is available for TLAS builds - else if (tlasCount) - 
drainCompute(); - blasesToBuild.clear(); - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Build BLASes END"); - computeCmdBuf->cmdbuf->endDebugMarker(); - } - // compact - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact BLASes START"); - computeCmdBuf->cmdbuf->endDebugMarker(); - { - // the already compacted BLASes need to be written into the TLASes using them, want to swap them out ASAP -//compactedBLASMap[as] = compacted; - } - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact BLASes END"); - computeCmdBuf->cmdbuf->endDebugMarker(); - } + const auto asCount = asesToBuild.size(); + if (asCount==0) + return {}; + + constexpr bool IsTLAS = std::is_same_v; + using CPUAccelerationStructure = std::conditional_t; - // Device TLAS builds - if (tlasCount) - { - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Build TLASes START"); - computeCmdBuf->cmdbuf->endDebugMarker(); - // A single pipeline barrier to ensure BLASes build before TLASes is needed - const asset::SMemoryBarrier readBLASInTLASBuildBarrier = { - // the last use of the source BLAS could have been a build or a compaction - .srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT|PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT, - .srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT, - .dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT, - .dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT - }; - // either we built no BLASes (remember we could retrieve already built ones from cache) or we barrier for the previous compactions or builds - const bool failedBLASBarrier = blasCount && !pipelineBarrier(computeCmdBuf,{.memBarriers={&readBLASInTLASBuildBarrier,1}},"Failed to sync BLAS with TLAS build!"); - // TLAS compactions to do later core::vector compactions; // 0xffFFffFFu when not releasing ownership, otherwise index into `ownershipTransfers` where the ownership release for the old buffer was 
core::vector compactedOwnershipReleaseIndices; - compactions.reserve(tlasCount); - compactedOwnershipReleaseIndices.reserve(tlasCount); + compactions.reserve(asCount); + compactedOwnershipReleaseIndices.reserve(asCount); // build { + auto* scratchBuffer = params.scratchForDeviceASBuild->getBuffer(); + core::vector flushRanges; + const bool manualFlush = scratchBuffer->getBoundMemory().memory->haveToMakeVisible(); + if (manualFlush) // TLAS builds do max 2 writes each and BLAS do much more anyway + flushRanges.reserve(asCount*2); + // lambdas! + auto streamDataToScratch = [&](const size_t offset, const size_t size,IUtilities::IUpstreamingDataProducer& callback) -> bool + { + if (deviceASBuildScratchPtr) + { + callback(deviceASBuildScratchPtr+offset,0ull,size); + if (manualFlush) + flushRanges.emplace_back(scratchBuffer->getBoundMemory().memory,offset,size,ILogicalDevice::MappedMemoryRange::align_non_coherent_tag); + return true; + } + else if (const SBufferRange range={.offset=offset,.size=size,.buffer=smart_refctd_ptr(scratchBuffer)}; params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,callback)) + return true; + else + return false; + }; // - core::vector buildInfos; - buildInfos.reserve(tlasCount); - core::vector rangeInfos; - rangeInfos.reserve(tlasCount); + core::vector buildInfos; + buildInfos.reserve(asCount); + using build_range_info_t = std::conditional_t; + core::vector rangeInfos; + rangeInfos.reserve(asCount); + using scratch_allocator_t = std::remove_reference_t; + using addr_t = typename scratch_allocator_t::size_type; + core::vector allocOffsets; + allocOffsets.reserve(asCount); + core::vector allocSizes; + allocSizes.reserve(asCount); + // BLAS and TLAS specific things + core::vector geometryRangeInfo; + core::vector> triangles; + core::vector> aabbs; core::vector> trackedBLASes; - trackedBLASes.reserve(maxASCount); + if constexpr (IsTLAS) + trackedBLASes.reserve(asCount); + else // would have to count total geometries in 
BLASes to initialize properly, and we probably don't want to over-reserve + { + geometryRangeInfo.reserve(asCount); + triangles.reserve(asCount); + aabbs.reserve(asCount); + } + // + core::vector alignments; + alignments.reserve(asCount*2); + constexpr auto GeometryIsAABBFlag = IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; auto recordBuildCommands = [&]()->void { - // rewrite the trackedBLASes pointers - for (auto& info : buildInfos) + bool success = !buildInfos.empty(); + // Lets analyze sync cases: + // - Mapped Host write = no barrier, flush & optional submit sufficient + // - Single Queue = Global Memory Barrier + // - Two distinct Queues = no barrier, semaphore signal-wait is sufficient + // - Two distinct Queue Families Exclusive Sharing mode = QFOT necessary but we require concurrent sharing on the scratch buffer ! + if (success) + { + const asset::SMemoryBarrier readGeometryOrInstanceInASBuildBarrier = { + // the last use of the source BLAS could have been a build or a compaction + .srcStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, + .srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT, + .dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT, + .dstAccessMask = ACCESS_FLAGS::STORAGE_READ_BIT + }; + success = !uniQueue || deviceASBuildScratchPtr || pipelineBarrier(computeCmdBuf,{.memBarriers={&readGeometryOrInstanceInASBuildBarrier,1}},"Pipeline Barriers of Acceleration Structure backing Buffers failed!"); + } + // + constexpr bool IsTLAS = std::is_same_v; + if (success) + { + // rewrite the based pointers + if constexpr (IsTLAS) + for (auto& info : buildInfos) + { + const auto offset = info.trackedBLASes.data(); + const auto correctPtr = trackedBLASes.data()+reinterpret_cast(offset); + info.trackedBLASes = {reinterpret_cast(correctPtr),info.trackedBLASes.size()}; + } + else + { + for (auto& info : buildInfos) + { + if (info.buildFlags.hasFlags(GeometryIsAABBFlag)) + info.aabbs = 
aabbs.data()+reinterpret_cast(info.aabbs); + else + info.triangles = triangles.data()+reinterpret_cast(info.triangles); + } + for (auto& rangeInfo : rangeInfos) + rangeInfo = geometryRangeInfo.data()+reinterpret_cast(rangeInfo); + } + success = computeCmdBuf->cmdbuf->buildAccelerationStructures({buildInfos},rangeInfos.data()); + } + // account for the in-progress allocation (we may be called from an overflow submit) + const auto oldAllocCount = allocOffsets.size()-alignments.size(); + if (success) + { + submitsNeeded |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; + // queue up a deferred allocation + params.scratchForDeviceASBuild->multi_deallocate(oldAllocCount,allocOffsets.data(),allocSizes.data(),params.compute->getFutureScratchSemaphore()); + } + else + { + // release right away + params.scratchForDeviceASBuild->multi_deallocate(oldAllocCount,allocOffsets.data(),allocSizes.data()); + for (const auto& info : buildInfos) + { + const auto stagingFound = findInStaging.template operator()(info.dstAS); + smart_refctd_ptr dummy; // already null at this point + markFailure("AS Build Command Recording",&dummy,&stagingFound->second); + } + } + allocOffsets.erase(allocOffsets.begin(),allocOffsets.begin()+oldAllocCount); + allocSizes.erase(allocSizes.begin(),allocSizes.begin()+oldAllocCount); + buildInfos.clear(); + rangeInfos.clear(); + if constexpr (IsTLAS) + trackedBLASes.clear(); + else { - const auto offset = info.trackedBLASes.data(); - const auto correctPtr = trackedBLASes.data()+reinterpret_cast(offset); - info.trackedBLASes = {reinterpret_cast(correctPtr),info.trackedBLASes.size()}; + geometryRangeInfo.clear(); + triangles.clear(); + aabbs.clear(); } - recordBuildCommandsBase(buildInfos,rangeInfos); - trackedBLASes.clear(); }; - // + + computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Build Acceleration Structures START"); + computeCmdBuf->cmdbuf->endDebugMarker(); const auto& limits = physDev->getLimits(); - for (auto& tlasToBuild : tlasesToBuild) + for (auto& 
asToBuild : asesToBuild) { - auto& canonical = tlasToBuild.second.canonical; - const auto as = tlasToBuild.first; - const auto pFound = &findInStaging.template operator()(as)->second; + auto& canonical = asToBuild.second.canonical; + const auto as = asToBuild.first; + const auto pFound = &findInStaging.template operator()(as)->second; const auto& backingRange = as->getCreationParams().bufferRange; // checking ownership for the future on old buffer, but compacted will be made with same sharing creation parameters const auto finalOwnerQueueFamily = checkOwnership(backingRange.buffer.get(),params.getFinalOwnerQueueFamily(as,pFound->cacheKey.value),computeFamily); @@ -4845,79 +4791,137 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul markFailure("invalid Final Queue Family given by user callback",&canonical,pFound); continue; } - const auto instances = canonical->getInstances(); - const auto instanceCount = static_cast(instances.size()); - const auto& instanceMap = tlasToBuild.second.instanceMap; - size_t instanceDataSize = 0; - // gather total input size and check dependants exist - bool dependsOnBLASBuilds = false; - for (const auto& instance : instances) - { - auto found = instanceMap.find(instance.getBase().blas.get()); - assert(instanceMap.end()!=found); - const auto depInfo = missingDependent.template operator()(found->second.get()); - if (depInfo) + // clean up the allocation if we fail to make it to the end of loop for whatever reason + alignments.clear(); + auto allocCount = 0; + auto deallocSrc = core::makeRAIIExiter([¶ms,&allocOffsets,&allocSizes,&alignments,&allocCount]()->void { - instanceDataSize = 0; - break; + const auto beginIx = allocSizes.size()-alignments.size(); + // if got to end of loop queue up the release of memory, otherwise release right away + if (allocCount) + params.scratchForDeviceASBuild->multi_deallocate(allocCount,allocOffsets.data()+beginIx,allocSizes.data()+beginIx); + allocOffsets.resize(beginIx); + 
allocSizes.resize(beginIx); + alignments.clear(); } - if (depInfo.wasInStaging) - dependsOnBLASBuilds = true; - instanceDataSize += ITopLevelAccelerationStructure::getInstanceSize(instance.getType()); - } - // problem with building some Dependent BLASes - if (failedBLASBarrier && dependsOnBLASBuilds) + ); + allocSizes.push_back(asToBuild.second.scratchSize); + alignments.push_back(limits.minAccelerationStructureScratchOffsetAlignment); + const bitflag buildFlags = asToBuild.second.getBuildFlags(); + if constexpr (IsTLAS) { - markFailure("building BLASes which current TLAS build wants to instance",&canonical,pFound); - continue; + const auto instances = canonical->getInstances(); + // gather total input size and check dependants exist + size_t instanceDataSize = 0; + bool dependsOnBLASBuilds = false; + const auto& instanceMap = asToBuild.second.instanceMap; + for (const auto& instance : instances) + { + auto found = instanceMap.find(instance.getBase().blas.get()); + assert(instanceMap.end()!=found); + const auto depInfo = missingDependent.template operator()(found->second.get()); + if (depInfo) + { + instanceDataSize = 0; + break; + } + if (depInfo.wasInStaging) + dependsOnBLASBuilds = true; + instanceDataSize += ITopLevelAccelerationStructure::getInstanceSize(instance.getType()); + } + // problem with building some Dependent BLASes + if (failedBLASBarrier && dependsOnBLASBuilds) + { + markFailure("building BLASes which current TLAS build wants to instance",&canonical,pFound); + continue; + } + // problem with finding the dependents (BLASes) + if (instanceDataSize==0) + { + markFailure("finding valid Dependant GPU BLASes for TLAS build",&canonical,pFound); + continue; + } + allocSizes.push_back(instanceDataSize); + alignments.push_back(16); + if (as->usesMotion()) + { + allocSizes.push_back(sizeof(void*)*instances.size()); + alignments.push_back(alignof(uint64_t)); + } } - // problem with finding the dependents (BLASes) - if (instanceDataSize==0) + else { - 
markFailure("finding valid Dependant GPU BLASes for TLAS build",&canonical,pFound); - continue; - } - // allocate scratch and build inputs - constexpr uint32_t MaxAllocCount = 3; - addr_t offsets[MaxAllocCount] = {scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value}; - const addr_t sizes[MaxAllocCount] = {tlasToBuild.second.scratchSize,instanceDataSize,sizeof(void*)*instanceCount}; - const auto AllocCount = as->usesMotion() ? 3:2; - // clean up the allocation if we fail to make it to the end of loop for whatever reason - bool abortAllocation = true; - auto deallocSrc = core::makeRAIIExiter([¶ms,&scratchOffsets,&scratchSizes,AllocCount,&offsets,&sizes,&abortAllocation]()->void + const uint32_t* pPrimitiveCounts = canonical->getGeometryPrimitiveCounts().data(); + if (buildFlags.hasFlags(GeometryIsAABBFlag)) { - // if got to end of loop queue up the release of memory, otherwise release right away - if (abortAllocation) - params.scratchForDeviceASBuild->multi_deallocate(AllocCount,&offsets[0],&sizes[0]); - else - for (auto i=0; igetAABBGeometries()) + if (const auto aabbCount=*(pPrimitiveCounts++); aabbCount) { - scratchOffsets.push_back(offsets[i]); - scratchSizes.push_back(sizes[i]); + allocSizes.push_back(aabbCount*geom.stride); + alignments.push_back(alignof(float)); } } - ); - // allocate out scratch or submit overflow - { - const addr_t alignments[MaxAllocCount] = {limits.minAccelerationStructureScratchOffsetAlignment,16,alignof(uint64_t)}; - // if fail then flush and keep trying till space is made - for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(AllocCount,&offsets[0],&sizes[0],&alignments[0])!=0u; t++) - if (t==1) // don't flush right away cause allocator not defragmented yet + else { - recordBuildCommands(); - // if writing to scratch directly, flush the writes - if (!flushRanges.empty()) + for (const auto& geom : canonical->getTriangleGeometries()) + if (const auto 
triCount=*(pPrimitiveCounts++); triCount) { - device->flushMappedMemoryRanges(flushRanges); - flushRanges.clear(); + auto size = geom.vertexStride*(geom.vertexData[1] ? 2:1)*geom.maxVertex; + if (geom.hasTransform()) + size = core::alignUp(size,alignof(float))+sizeof(hlsl::float32_t3x4); + auto alignment = 0u; + switch (geom.indexType) + { + case E_INDEX_TYPE::EIT_16BIT: + alignment = alignof(uint16_t); + break; + case E_INDEX_TYPE::EIT_32BIT: + alignment = alignof(uint32_t); + break; + default: + break; + } + if (alignment) + size = core::alignUp(size,alignment)+triCount*3*alignment; + allocSizes.push_back(size); + alignments.push_back(hlsl::max(alignment,geom.vertexStride)); } - drainCompute(); } } - // stream the instance/geometry input in + allocOffsets.resize(allocSizes.size(),scratch_allocator_t::invalid_value); + // allocate out scratch or submit overflow, if fail then flush and keep trying till space is made + auto* const offsets = allocOffsets.data()+allocOffsets.size()-allocCount; + const auto* const sizes = allocSizes.data()+allocSizes.size()-allocCount; + for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(allocCount,offsets,sizes,alignments.data())!=0; t++) + if (t==1) // don't flush right away cause allocator not defragmented yet + { + recordBuildCommands(); + // if writing to scratch directly, flush the writes + if (!flushRanges.empty()) + { + device->flushMappedMemoryRanges(flushRanges); + flushRanges.clear(); + } + drainCompute(); + } + // now upon a failure, our allocations will need to be deallocated + allocCount = alignments.size(); + // prepare build infos + typename AccelerationStructure::DeviceBuildInfo buildInfo; + buildInfo.scratch = {.offset=offsets[0],.buffer=smart_refctd_ptr(scratchBuffer)}; + buildInfo.buildFlags = buildFlags; + buildInfo.dstAS = as; + // abortion backup + bool success = true; + const auto geometryRangeInfoOffset = geometryRangeInfo.size(); + const auto trianglesOffset = triangles.size(); + const auto 
aabbsOffset = aabbs.size(); const size_t trackedBLASesOffset = trackedBLASes.size(); + if constexpr (IsTLAS) { - bool success = true; + const auto instances = canonical->getInstances(); + const auto instanceCount = static_cast(instances.size()); + // stream the instance/geometry input in { struct FillInstances : IUtilities::IUpstreamingDataProducer { @@ -4955,11 +4959,11 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul FillInstances fillInstances; fillInstances.compactedBLASMap = &compactedBLASMap; fillInstances.trackedBLASes = &trackedBLASes; - fillInstances.instanceMap = &tlasToBuild.second.instanceMap; + fillInstances.instanceMap = &asToBuild.second.instanceMap; fillInstances.instances = instances; success = streamDataToScratch(offsets[1],sizes[1],fillInstances); // provoke refcounting bugs right away - tlasToBuild.second.instanceMap.clear(); + asToBuild.second.instanceMap.clear(); } if (success && as->usesMotion()) { @@ -4989,33 +4993,107 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul fillInstancePointers.instanceAddress = scratchBuffer->getDeviceAddress()+offsets[1]; success = streamDataToScratch(offsets[2],sizes[2],fillInstancePointers); } - // current recording buffer may have changed - xferCmdBuf = params.transfer->getCommandBufferForRecording(); - if (!success) + // + buildInfo.instanceDataTypeEncodedInPointersLSB = as->usesMotion(); + // note we don't build directly from staging, because only very small inputs could come from there and they'd impede the transfer efficiency of the larger ones + buildInfo.instanceData = {.offset=offsets[as->usesMotion() ? 
2:1],.buffer=smart_refctd_ptr(scratchBuffer)}; + // be based cause vectors can grow + using p_p_BLAS_t = const IGPUBottomLevelAccelerationStructure**; + buildInfo.trackedBLASes = {reinterpret_cast(trackedBLASesOffset),trackedBLASes.size()-trackedBLASesOffset}; + // no special extra byte offset into the instance buffer + rangeInfos.emplace_back(instanceCount,0u); + } + else + { + buildInfo.geometryCount = canonical->getGeometryCount(); + const auto* offsetIt = offsets+1; + const auto primitiveCounts = canonical->getGeometryPrimitiveCounts(); + for (const auto count : primitiveCounts) + geometryRangeInfo.push_back({ + .primitiveCount = count, + .primitiveByteOffset = 0, + .firstVertex = 0, + .transformByteOffset = 0 + }); + const uint32_t* pPrimitiveCounts = canonical->getGeometryPrimitiveCounts().data(); + if (buildFlags.hasFlags(GeometryIsAABBFlag)) { - trackedBLASes.resize(trackedBLASesOffset); - markFailure("Uploading Instance Data for TLAS build failed",&canonical,pFound); - continue; + for (const auto& geom : canonical->getAABBGeometries()) + if (const auto aabbCount=*(pPrimitiveCounts++); aabbCount) + { + auto offset = *(offsetIt++); +// TODO: stream in the data + aabbs.push_back({ + .data = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}, + .stride = geom.stride, + .geometryFlags = geom.geometryFlags + }); + } + buildInfo.aabbs = reinterpret_cast* const&>(aabbsOffset); } - // let go of canonical asset (may free RAM) - canonical = nullptr; + else + { + for (const auto& geom : canonical->getTriangleGeometries()) + if (const auto triCount=*(pPrimitiveCounts++); triCount) + { + auto& outGeom = triangles.emplace_back(); + auto offset = *(offsetIt++); +// TODO: stream in the data + outGeom.vertexData[0] = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}; + offset += geom.vertexStride*geom.maxVertex; + if (geom.vertexData[1]) + { + outGeom.vertexData[1] = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}; + offset += 
geom.vertexStride*geom.maxVertex; + } + if (geom.hasTransform()) + { + offset = core::alignUp(offset,alignof(float)); + outGeom.transform = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}; + offset += sizeof(hlsl::float32_t3x4); + } + switch (geom.indexType) + { + case E_INDEX_TYPE::EIT_16BIT: [[fallthrough]]; + case E_INDEX_TYPE::EIT_32BIT: + { + const auto alignment = geom.indexType==E_INDEX_TYPE::EIT_16BIT ? alignof(uint16_t):alignof(uint32_t); + offset = core::alignUp(offset,alignment); + outGeom.indexData = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}; + break; + } + default: + break; + } + outGeom.maxVertex = geom.maxVertex; + outGeom.vertexStride = geom.vertexStride; + outGeom.vertexFormat = geom.vertexFormat; + outGeom.indexType = geom.indexType; + outGeom.geometryFlags = geom.geometryFlags; + } + buildInfo.triangles = reinterpret_cast* const&>(trianglesOffset); + } + rangeInfos.push_back(reinterpret_cast(geometryRangeInfoOffset)); +success = false; } - // prepare build infos - auto& buildInfo = buildInfos.emplace_back(); - buildInfo.scratch = {.offset=offsets[0],.buffer=smart_refctd_ptr(scratchBuffer)}; - buildInfo.buildFlags = tlasToBuild.second.getBuildFlags(); - buildInfo.instanceDataTypeEncodedInPointersLSB = as->usesMotion(); - buildInfo.dstAS = as; - // note we don't build directly from staging, because only very small inputs could come from there and they'd impede the transfer efficiency of the larger ones - buildInfo.instanceData = {.offset=offsets[as->usesMotion() ? 
2:1],.buffer=smart_refctd_ptr(scratchBuffer)}; - // be based cause vectors can grow - using p_p_BLAS_t = const IGPUBottomLevelAccelerationStructure**; - buildInfo.trackedBLASes = {reinterpret_cast(trackedBLASesOffset),trackedBLASes.size()-trackedBLASesOffset}; - // no special extra byte offset into the instance buffer - rangeInfos.emplace_back(instanceCount,0u); - abortAllocation = false; + // current recording buffer may have changed + xferCmdBuf = params.transfer->getCommandBufferForRecording(); + if (!success) + { + rangeInfos.resize(buildInfos.size()); + geometryRangeInfo.resize(geometryRangeInfoOffset); + triangles.resize(trianglesOffset); + aabbs.resize(aabbsOffset); + trackedBLASes.resize(trackedBLASesOffset); + markFailure("Uploading Input Data for Accleration Structure build failed",&canonical,pFound); + continue; + } + buildInfos.emplace_back(std::move(buildInfo)); + allocCount = 0; + // let go of canonical asset (may free RAM) + canonical = nullptr; // - const bool willCompact = tlasToBuild.second.compact; + const bool willCompact = asToBuild.second.compact; if (willCompact) compactions.push_back(as); // enqueue ownership release if necessary @@ -5041,130 +5119,178 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } // finish the last batch recordBuildCommands(); + computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Build Acceleration Structures END"); + computeCmdBuf->cmdbuf->endDebugMarker(); + // provoke refcounting bugs + asesToBuild.clear(); + // flush all ranged before potential submit if (!flushRanges.empty()) { device->flushMappedMemoryRanges(flushRanges); flushRanges.clear(); } - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact TLASes END"); - computeCmdBuf->cmdbuf->endDebugMarker(); } - tlasesToBuild.clear(); - // compact - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact TLASes START"); - computeCmdBuf->cmdbuf->endDebugMarker(); - // compact needs to wait for Build then record queries + + 
// Not messing around with listing AS backing buffers individually, ergonomics of that are null + const asset::SMemoryBarrier readASInASCompactBarrier = { + .srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT, + .srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT, + // TODO: do queries or query retrieval have a stage? + .dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT, + .dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT + }; if (!compactions.empty() && pipelineBarrier(computeCmdBuf,{.memBarriers={&readASInASCompactBarrier,1}},"Failed to sync Acceleration Structure builds with compactions!") && computeCmdBuf->cmdbuf->resetQueryPool(queryPool.get(),0,compactions.size()) && computeCmdBuf->cmdbuf->writeAccelerationStructureProperties(compactions,IQueryPool::TYPE::ACCELERATION_STRUCTURE_COMPACTED_SIZE,queryPool.get(),0) ) { - // submit cause host needs to read the queries + // clean AS builds, pipeline barrier, query reset and writes need to get executed before we start waiting on the results drainCompute(); // get queries core::vector sizes(compactions.size()); - if (device->getQueryPoolResults( - queryPool.get(),0,compactions.size(),sizes.data(),sizeof(size_t), - bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT)|IQueryPool::RESULTS_FLAGS::_64_BIT - )) + if (!device->getQueryPoolResults(queryPool.get(),0,compactions.size(),sizes.data(),sizeof(size_t),bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT)|IQueryPool::RESULTS_FLAGS::_64_BIT)) { - auto logFail = [logger](const char* msg, const IGPUAccelerationStructure* as)->void - { - logger.log("Failed to %s for \"%s\"", system::ILogger::ELL_ERROR,as->getObjectDebugName()); - }; - // TODO: normally we'd iteratively record as many compactions as we can, but we don't have a mechanism to release already compacted TLASes, we'd need to defer the writing of the TLAS to the Descriptor Set till later - // create and allocate backing buffers for compacted TLASes - core::vector> 
backingBuffers(compactions.size()); + logger.log("Failed to Query %sLevelAccelerationStructure compacted sizes, skipping compaction!",system::ILogger::ELL_ERROR,IsTLAS ? "Top":"Bottom"); + return {}; + } + // + auto logFail = [logger](const char* msg, const IGPUAccelerationStructure* as)->void + { + logger.log("Failed to %s for \"%s\"",system::ILogger::ELL_ERROR,msg,as->getObjectDebugName()); + }; + // try to allocate memory for + core::vector> backingBuffers(compactions.size()); + { + MetaDeviceMemoryAllocator deferredAllocator(params.compactedASAllocator ? params.compactedASAllocator:device,logger); + // create + for (size_t i=0; i(compactions[i]); + assert(as); + // silently skip if not worth it + if (!params.confirmCompact(sizes[i],as)) { - const auto* as = static_cast(compactions[i]); - assert(as); - // silently skip if not worth it - if (!params.confirmCompact(sizes[i],as)) + logger.log("Compaction not confirmed for \"%s\" would be compacted size is %d, original %d.",system::ILogger::ELL_DEBUG,as->getObjectDebugName(),sizes[i],as->getCreationParams().bufferRange.size); + continue; + } + // create backing buffer and request an allocation for it + { + const auto* oldBuffer = as->getCreationParams().bufferRange.buffer.get(); + assert(oldBuffer); + // This is a Spec limit/rpomise we don't even expose it + constexpr size_t MinASBufferAlignment = 256u; + using usage_f = IGPUBuffer::E_USAGE_FLAGS; + IGPUBuffer::SCreationParams creationParams = { {.size=core::roundUp(sizes[i],MinASBufferAlignment),.usage=usage_f::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT|usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT},{}}; + // same sharing setup as the previous AS buffer + creationParams.queueFamilyIndexCount = oldBuffer->getCachedCreationParams().queueFamilyIndexCount; + creationParams.queueFamilyIndices = oldBuffer->getCachedCreationParams().queueFamilyIndices; + auto buf = device->createBuffer(std::move(creationParams)); + if (!buf) + { + logFail("create Buffer backing the Compacted 
Acceleration Structure",as); continue; - smart_refctd_ptr buff; + } + auto bufReqs = buf->getMemoryReqs(); + backingBuffers[i].value = std::move(buf); + // allocate new memory - definitely don't want to be raytracing from across the PCIE slot + if (!deferredAllocator.request(backingBuffers.data()+i,physDev->getDeviceLocalMemoryTypeBits())) { - const auto* oldBuffer = as->getCreationParams().bufferRange.buffer.get(); - assert(oldBuffer); - // - constexpr size_t MinASBufferAlignment = 256u; - using usage_f = IGPUBuffer::E_USAGE_FLAGS; - IGPUBuffer::SCreationParams creationParams = { {.size=core::roundUp(sizes[i],MinASBufferAlignment),.usage = usage_f::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT|usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT},{}}; - creationParams.queueFamilyIndexCount = oldBuffer->getCachedCreationParams().queueFamilyIndexCount; - creationParams.queueFamilyIndices = oldBuffer->getCachedCreationParams().queueFamilyIndices; - auto buf = device->createBuffer(std::move(creationParams)); - if (!buf) - { - logFail("create Buffer backing the Compacted Acceleration Structure",as); - continue; - } - // allocate new memory - auto bufReqs = buff->getMemoryReqs(); - // definitely don't want to be raytracing from across the PCIE slot - if (!deferredAllocator.request(backingBuffers.data()+i,physDev->getDeviceLocalMemoryTypeBits())) - { - logFail("request of a Memory Allocation for the Buffer backing the Compacted Acceleration Structure",as); - continue; - } - backingBuffers[i].value = std::move(buf); + logFail("request of a Memory Allocation for the Buffer backing the Compacted Acceleration Structure",as); + continue; } } - // allocate memory for the buffers - deferredAllocator.finalize(); } + // allocate memory for the buffers + deferredAllocator.finalize(); + unordered_map> retval; + retval.reserve(compactions.size()); // recreate Acceleration Structures for (size_t i=0; i(compactions[i]); + const auto* srcAS = static_cast(compactions[i]); auto& backingBuffer = 
backingBuffers[i].value; if (!backingBuffer->getBoundMemory().isValid()) { - logFail("allocate Memory for the Buffer backing the Compacted Acceleration Structure",as); - continue; // reason to end a batch, see the TODO above + logFail("allocate Memory for the Buffer backing the Compacted Acceleration Structure",srcAS); + continue; + } + smart_refctd_ptr compactedAS; + { + typename AccelerationStructure::SCreationParams creationParams = {srcAS->getCreationParams()}; + creationParams.bufferRange = {.offset=0,.size=sizes[i],.buffer=std::move(backingBuffer)}; + if constexpr (IsTLAS) + { + creationParams.maxInstanceCount = srcAS->getMaxInstanceCount(); + compactedAS = device->createTopLevelAccelerationStructure(std::move(creationParams)); + } + else + compactedAS = device->createBottomLevelAccelerationStructure(std::move(creationParams)); } - IGPUTopLevelAccelerationStructure::SCreationParams creationParams = {as->getCreationParams()}; - creationParams.bufferRange = {.offset=0,.size=sizes[i],.buffer=std::move(backingBuffer)}; - creationParams.maxInstanceCount = as->getMaxInstanceCount(); - auto compactedAS = device->createTopLevelAccelerationStructure(std::move(creationParams)); if (!compactedAS) { - logFail("create the Compacted Acceleration Structure",as); + logFail("create the Compacted Acceleration Structure",srcAS); continue; } // set the debug name { - std::string debugName = as->getObjectDebugName(); + std::string debugName = srcAS->getObjectDebugName(); debugName += " compacted"; compactedAS->setObjectDebugName(debugName.c_str()); } // record compaction - if (!computeCmdBuf->cmdbuf->copyAccelerationStructure({.src=as,.dst=compactedAS.get(),.mode=IGPUAccelerationStructure::COPY_MODE::COMPACT})) + if (!computeCmdBuf->cmdbuf->copyAccelerationStructure({.src=srcAS,.dst=compactedAS.get(),.mode=IGPUAccelerationStructure::COPY_MODE::COMPACT})) { logFail("record Acceleration Structure compaction",compactedAS.get()); continue; } - // modify the ownership release + // 
modify the ownership release to be for the final compacted AS if (const auto ix=compactedOwnershipReleaseIndices[i]; ixgetCreationParams().bufferRange; // swap out the conversion result - const auto foundIx = outputReverseMap.find(as); + const auto foundIx = outputReverseMap.find(srcAS); if (foundIx!=outputReverseMap.end()) { - auto& resultOutput = std::get>(reservations.m_gpuObjects); + auto& resultOutput = std::get>(reservations.m_gpuObjects); resultOutput[foundIx->second].value = compactedAS; } // insert into compaction map - compactedTLASMap[as] = std::move(compactedAS); + retval[srcAS] = std::move(compactedAS); } + return retval; } + computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact Acceleration Structures START"); + computeCmdBuf->cmdbuf->endDebugMarker(); + computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact Acceleration Structures END"); + computeCmdBuf->cmdbuf->endDebugMarker(); + } + return {}; + }; + + // compacted BLASes need to be substituted in cache and TLAS Build Inputs + compactedBLASMap = buildAndCompactASes.template operator()(blasesToBuild); + // Device TLAS builds + if (tlasCount) + { + // either we built no BLASes (remember we could retrieve already built ones from cache) + if (blasCount) + { + // Or we barrier for the previous compactions or builds (a single pipeline barrier to ensure BLASes build before TLASes is needed) + const asset::SMemoryBarrier readBLASInTLASBuildBarrier = { + // the last use of the source BLAS could have been a build or a compaction + .srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT|PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT, + .srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT, + .dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT, + .dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT + }; + // submit because we want to launch BLAS builds in a separate submit, so the scratch semaphore can signal and free the 
scratch and more is available for TLAS builds + if (pipelineBarrier(computeCmdBuf,{.memBarriers={&readBLASInTLASBuildBarrier,1}},"Failed to sync BLAS with TLAS build!")) + drainCompute(); + else + failedBLASBarrier = true; } - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact TLASes END"); - computeCmdBuf->cmdbuf->endDebugMarker(); + compactedTLASMap = buildAndCompactASes.template operator()(tlasesToBuild); } // release ownership From 4b03383578805920cf98f14c4c9bf168f9c99b08 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 15 May 2025 14:39:35 +0200 Subject: [PATCH 123/346] nuke the old asset converter, nothing useful or not reimplemented there anymore --- include/nbl/video/declarations.h | 1 - .../utilities/IGPUObjectFromAssetConverter.h | 168 ------------------ 2 files changed, 169 deletions(-) delete mode 100644 include/nbl/video/utilities/IGPUObjectFromAssetConverter.h diff --git a/include/nbl/video/declarations.h b/include/nbl/video/declarations.h index ecec442366..2fdfe28e3c 100644 --- a/include/nbl/video/declarations.h +++ b/include/nbl/video/declarations.h @@ -34,7 +34,6 @@ #include "nbl/video/utilities/CDrawIndirectAllocator.h" #include "nbl/video/utilities/CSubpassKiln.h" #include "nbl/video/utilities/IUtilities.h" -#include "nbl/video/utilities/IGPUObjectFromAssetConverter.h" #include "nbl/video/utilities/SPhysicalDeviceFilter.h" #include "nbl/video/utilities/CSimpleResizeSurface.h" #include "nbl/video/utilities/CSmoothResizeSurface.h" diff --git a/include/nbl/video/utilities/IGPUObjectFromAssetConverter.h b/include/nbl/video/utilities/IGPUObjectFromAssetConverter.h deleted file mode 100644 index b7ffc5d0c1..0000000000 --- a/include/nbl/video/utilities/IGPUObjectFromAssetConverter.h +++ /dev/null @@ -1,168 +0,0 @@ -// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". 
-// For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _NBL_VIDEO_I_GPU_OBJECT_FROM_ASSET_CONVERTER_H_INCLUDED_ -#define _NBL_VIDEO_I_GPU_OBJECT_FROM_ASSET_CONVERTER_H_INCLUDED_ - -#include "nbl/core/declarations.h" -#include "nbl/core/alloc/LinearAddressAllocator.h" - -#include "nbl/video/ISemaphore.h" -#include "nbl/video/ILogicalDevice.h" - -#if 0 - // Convert CPUBuffer Deps to GPUBuffers - core::vector redirs = eliminateDuplicatesAndGenRedirs(cpuBufferDeps); - auto gpuBufs = getGPUObjectsFromAssets(cpuBufferDeps.data(), cpuBufferDeps.data()+cpuBufferDeps.size(), _params); - _params.waitForCreationToComplete(); - _params.beginCommandBuffers(); - size_t bufIter = 0ull; - - // Fill buildGeomInfos partially (to later ge Get AS Size before build command) - std::vector buildGeomInfos(toCreateAndBuild.size()); - - using GPUGeometry = IGPUAccelerationStructure::Geometry; - std::vector gpuGeoms; - gpuGeoms.reserve(assetCount * MaxGeometryPerBuildInfo); - - for (ptrdiff_t i = 0u; i < toCreateAndBuild.size(); ++i) - { - const asset::ICPUAccelerationStructure* cpuas = toCreateAndBuild[i]; - - auto cpuBuildInfo = cpuas->getBuildInfo(); - auto & gpuBuildInfo = buildGeomInfos[i]; - - gpuBuildInfo.type = cpuBuildInfo->type; - gpuBuildInfo.buildFlags = cpuBuildInfo->buildFlags; - gpuBuildInfo.buildMode = cpuBuildInfo->buildMode; - assert(cpuBuildInfo->buildMode == asset::IAccelerationStructure::EBM_BUILD); - - // Fill Later: - gpuBuildInfo.srcAS = nullptr; - gpuBuildInfo.dstAS = nullptr; - gpuBuildInfo.scratchAddr = {}; - - auto cpu_geoms = cpuBuildInfo->getGeometries().begin(); - auto geomsCount = cpuBuildInfo->getGeometries().size(); - if(geomsCount == 0) - { - assert(false); - continue; - } - - size_t startGeom = gpuGeoms.size(); - size_t endGeom = gpuGeoms.size() + geomsCount; - - for(uint32_t g = 0; g < geomsCount; ++g) - { - const auto& cpu_geom = cpu_geoms[g]; - - GPUGeometry gpu_geom = {}; - gpu_geom.type = cpu_geom.type; - gpu_geom.flags = 
cpu_geom.flags; - - if(cpu_geom.type == asset::IAccelerationStructure::EGT_TRIANGLES) - { - gpu_geom.data.triangles.vertexFormat = cpu_geom.data.triangles.vertexFormat; - gpu_geom.data.triangles.vertexStride = cpu_geom.data.triangles.vertexStride; - gpu_geom.data.triangles.maxVertex = cpu_geom.data.triangles.maxVertex; - gpu_geom.data.triangles.indexType = cpu_geom.data.triangles.indexType; - - { - IGPUOffsetBufferPair* gpubuf = (*gpuBufs)[redirs[bufIter++]].get(); - gpu_geom.data.triangles.indexData.buffer = core::smart_refctd_ptr(gpubuf->getBuffer()); - gpu_geom.data.triangles.indexData.offset = gpubuf->getOffset() + cpu_geom.data.triangles.indexData.offset; - } - { - IGPUOffsetBufferPair* gpubuf = (*gpuBufs)[redirs[bufIter++]].get(); - gpu_geom.data.triangles.vertexData.buffer = core::smart_refctd_ptr(gpubuf->getBuffer()); - gpu_geom.data.triangles.vertexData.offset = gpubuf->getOffset() + cpu_geom.data.triangles.vertexData.offset; - } - { - IGPUOffsetBufferPair* gpubuf = (*gpuBufs)[redirs[bufIter++]].get(); - gpu_geom.data.triangles.transformData.buffer = core::smart_refctd_ptr(gpubuf->getBuffer()); - gpu_geom.data.triangles.transformData.offset = gpubuf->getOffset() + cpu_geom.data.triangles.transformData.offset; - } - } - else if(cpu_geom.type == asset::IAccelerationStructure::EGT_AABBS) - { - gpu_geom.data.aabbs.stride = cpu_geom.data.aabbs.stride; - { - IGPUOffsetBufferPair* gpubuf = (*gpuBufs)[redirs[bufIter++]].get(); - gpu_geom.data.aabbs.data.buffer = core::smart_refctd_ptr(gpubuf->getBuffer()); - gpu_geom.data.aabbs.data.offset = gpubuf->getOffset() + cpu_geom.data.aabbs.data.offset; - } - } - else if(cpu_geom.type == asset::IAccelerationStructure::EGT_INSTANCES) - { - { - IGPUOffsetBufferPair* gpubuf = (*gpuBufs)[redirs[bufIter++]].get(); - gpu_geom.data.instances.data.buffer = core::smart_refctd_ptr(gpubuf->getBuffer()); - gpu_geom.data.instances.data.offset = gpubuf->getOffset() + cpu_geom.data.instances.data.offset; - } - } - - 
gpuGeoms.push_back(gpu_geom); - } - - gpuBuildInfo.geometries = core::SRange(gpuGeoms.data() + startGeom, gpuGeoms.data() + endGeom); - } - - // Get SizeInfo for each CPUAS -> Create the AS -> Get Total Scratch Buffer Size - std::vector buildSizes(toCreateAndBuild.size()); - uint64_t totalScratchBufferSize = 0ull; - uint64_t maxScratchBufferSize = 0ull; - for (ptrdiff_t i = 0u, toBuildIndex = 0u; i < assetCount; ++i) - { - const asset::ICPUAccelerationStructure* cpuas = _begin[i]; - if(cpuas->hasBuildInfo() == false) - { - // Only those with buildInfo (index in toCreateAndBuild vector) will get passed - continue; - } - - assert(cpuas == toCreateAndBuild[toBuildIndex]); - assert(toBuildIndex < toCreateAndBuild.size()); - - auto buildRanges = cpuas->getBuildRanges().begin(); - auto buildRangesCount = cpuas->getBuildRanges().size(); - - auto & gpuBuildInfo = buildGeomInfos[toBuildIndex]; - - std::vector maxPrimCount(buildRangesCount); - for(auto b = 0; b < buildRangesCount; b++) - maxPrimCount[b] = buildRanges[b].primitiveCount; - - auto buildSize = _params.device->getAccelerationStructureBuildSizes(gpuBuildInfo, maxPrimCount.data()); - buildSizes[i] = buildSize; - - auto gpuAS = allocateBufferAndCreateAccelerationStructure(buildSize.accelerationStructureSize, cpuas); - res->operator[](i) = gpuAS; - - // complete the buildGeomInfos (now only thing left is to allocate and set scratchAddr.buffer) - buildGeomInfos[toBuildIndex].dstAS = gpuAS.get(); - buildGeomInfos[toBuildIndex].scratchAddr.offset = totalScratchBufferSize; - - totalScratchBufferSize += buildSize.buildScratchSize; - core::max(maxScratchBufferSize, buildSize.buildScratchSize); // maxScratchBufferSize has no use now (unless we changed this function to build 1 by 1 instead of batch builds or have some kind of memory limit?) 
- ++toBuildIndex; - } - - // Allocate Scratch Buffer - IGPUBuffer::SCreationParams gpuScratchBufParams = {}; - gpuScratchBufParams.size = totalScratchBufferSize; - gpuScratchBufParams.usage = core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_STORAGE_BUFFER_BIT; - auto gpuScratchBuf = _params.device->createBuffer(std::move(gpuScratchBufParams)); - auto mreqs = gpuScratchBuf->getMemoryReqs(); - mreqs.memoryTypeBits &= _params.device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - auto gpuScratchBufMem = _params.device->allocate(mreqs, gpuScratchBuf.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - - - for (ptrdiff_t i = 0u; i < toCreateAndBuild.size(); ++i) - { - auto & gpuBuildInfo = buildGeomInfos[i]; - gpuBuildInfo.scratchAddr.buffer = gpuScratchBuf; - } -#endif - -#endif From 0ebdda6eafc89525716f2959c8a5a72cf21f8cd6 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 15 May 2025 14:59:46 +0200 Subject: [PATCH 124/346] proper default initializer for triangle BLAS geometry transforms --- include/nbl/asset/IAccelerationStructure.h | 59 +++++++++++++--------- 1 file changed, 34 insertions(+), 25 deletions(-) diff --git a/include/nbl/asset/IAccelerationStructure.h b/include/nbl/asset/IAccelerationStructure.h index d251dd3077..0efe6781ae 100644 --- a/include/nbl/asset/IAccelerationStructure.h +++ b/include/nbl/asset/IAccelerationStructure.h @@ -92,31 +92,40 @@ class IBottomLevelAccelerationStructure : public IAccelerationStructure template requires std::is_base_of_v struct Triangles { - using buffer_t = std::remove_const_t; - constexpr static inline bool Host = std::is_same_v; - // we make our life easier by not taking pointers to single matrix values - using transform_t = std::conditional_t>; - - inline bool hasTransform() const - { - if constexpr (Host) - return !core::isnan(transform[0][0]); - else - return bool(transform.buffer); - } - - // optional, only useful for baking model transforms of multiple meshes into one 
BLAS - transform_t transform = {}; - // vertexData[1] are the vertex positions at time 1.0, and only used for AccelerationStructures created with `MOTION_BIT` - asset::SBufferBinding vertexData[2] = {{},{}}; - asset::SBufferBinding indexData = {}; - uint32_t maxVertex = 0u; - // type implicitly satisfies: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkAccelerationStructureGeometryTrianglesDataKHR-vertexStride-03819 - uint32_t vertexStride = sizeof(float); - E_FORMAT vertexFormat = EF_R32G32B32_SFLOAT; - E_INDEX_TYPE indexType = EIT_UNKNOWN; - core::bitflag geometryFlags = GEOMETRY_FLAGS::NONE; - // TODO: opacity and displacement micromap buffers and shizz + public: + using buffer_t = std::remove_const_t; + constexpr static inline bool Host = std::is_same_v; + // we make our life easier by not taking pointers to single matrix values + using transform_t = std::conditional_t>; + + inline bool hasTransform() const + { + if constexpr (Host) + return !core::isnan(transform[0][0]); + else + return bool(transform.buffer); + } + + // optional, only useful for baking model transforms of multiple meshes into one BLAS + transform_t transform = __transform_initializer(); + // vertexData[1] are the vertex positions at time 1.0, and only used for AccelerationStructures created with `MOTION_BIT` + asset::SBufferBinding vertexData[2] = {{},{}}; + asset::SBufferBinding indexData = {}; + uint32_t maxVertex = 0u; + // type implicitly satisfies: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkAccelerationStructureGeometryTrianglesDataKHR-vertexStride-03819 + uint32_t vertexStride = sizeof(float); + E_FORMAT vertexFormat = EF_R32G32B32_SFLOAT; + E_INDEX_TYPE indexType = EIT_UNKNOWN; + core::bitflag geometryFlags = GEOMETRY_FLAGS::NONE; + // TODO: opacity and displacement micromap buffers and shizz + + private: + constexpr static transform_t __transform_initializer() + { + if constexpr (Host) + return 
hlsl::float32_t3x4(std::numeric_limits::quiet_NaN()); + return {}; + } }; // From c32846fbf8377d221a74c7b010c81090f7f34f65 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 15 May 2025 15:58:35 +0200 Subject: [PATCH 125/346] Stream the BLAS build inputs, fix a bug and note another one that has to get fixed --- src/nbl/video/utilities/CAssetConverter.cpp | 41 ++++++++++++++------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 0167a96a43..285a1dce1d 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2854,6 +2854,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { const uint32_t* pPrimitiveCounts = as->getGeometryPrimitiveCounts().data(); // the code here is not pretty, but DRY-ing is of this is for later +// TODO: ILogicalDevice needs code to query build sizes of ICPUBottomLevelAccelerationStructure geometries! 
if (buildFlags.hasFlags(ICPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) { const auto geoms = as->getAABBGeometries(); @@ -4890,9 +4891,9 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } allocOffsets.resize(allocSizes.size(),scratch_allocator_t::invalid_value); // allocate out scratch or submit overflow, if fail then flush and keep trying till space is made - auto* const offsets = allocOffsets.data()+allocOffsets.size()-allocCount; - const auto* const sizes = allocSizes.data()+allocSizes.size()-allocCount; - for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(allocCount,offsets,sizes,alignments.data())!=0; t++) + auto* const offsets = allocOffsets.data()+allocOffsets.size()-alignments.size(); + const auto* const sizes = allocSizes.data()+allocSizes.size()-alignments.size(); + for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(alignments.size(),offsets,sizes,alignments.data())!=0; t++) if (t==1) // don't flush right away cause allocator not defragmented yet { recordBuildCommands(); @@ -5007,6 +5008,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul { buildInfo.geometryCount = canonical->getGeometryCount(); const auto* offsetIt = offsets+1; + const auto* sizeIt = sizes+1; const auto primitiveCounts = canonical->getGeometryPrimitiveCounts(); for (const auto count : primitiveCounts) geometryRangeInfo.push_back({ @@ -5015,14 +5017,17 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul .firstVertex = 0, .transformByteOffset = 0 }); - const uint32_t* pPrimitiveCounts = canonical->getGeometryPrimitiveCounts().data(); + const uint32_t* pPrimitiveCounts = primitiveCounts.data(); + IUtilities::CMemcpyUpstreamingDataProducer memcpyCallback; if (buildFlags.hasFlags(GeometryIsAABBFlag)) { for (const auto& geom : canonical->getAABBGeometries()) if (const auto aabbCount=*(pPrimitiveCounts++); aabbCount) { auto offset = *(offsetIt++); -// TODO: stream in the data + 
memcpyCallback.data = reinterpret_cast(geom.data.buffer->getPointer())+geom.data.offset; + if (!streamDataToScratch(offset,*(sizeIt++),memcpyCallback)) + break; aabbs.push_back({ .data = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}, .stride = geom.stride, @@ -5038,19 +5043,24 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul { auto& outGeom = triangles.emplace_back(); auto offset = *(offsetIt++); -// TODO: stream in the data - outGeom.vertexData[0] = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}; - offset += geom.vertexStride*geom.maxVertex; - if (geom.vertexData[1]) + auto size = geom.vertexStride*geom.maxVertex; + for (auto i=0; i<2; i++) + if (geom.vertexData[i]) // could assert that it must be true for i==0 { - outGeom.vertexData[1] = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}; - offset += geom.vertexStride*geom.maxVertex; + outGeom.vertexData[i] = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}; + memcpyCallback.data = reinterpret_cast(geom.vertexData[i].buffer->getPointer())+geom.vertexData[i].offset; + if (!streamDataToScratch(offset,size,memcpyCallback)) + break; + offset += size; } if (geom.hasTransform()) { offset = core::alignUp(offset,alignof(float)); outGeom.transform = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}; - offset += sizeof(hlsl::float32_t3x4); + memcpyCallback.data = &geom.transform; + if (!streamDataToScratch(offset,sizeof(geom.transform),memcpyCallback)) + break; + offset += sizeof(geom.transform); } switch (geom.indexType) { @@ -5060,11 +5070,16 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul const auto alignment = geom.indexType==E_INDEX_TYPE::EIT_16BIT ? 
alignof(uint16_t):alignof(uint32_t); offset = core::alignUp(offset,alignment); outGeom.indexData = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}; + size = triCount*3*alignment; + memcpyCallback.data = reinterpret_cast(geom.indexData.buffer->getPointer())+geom.indexData.offset; + success = streamDataToScratch(offset,size,memcpyCallback); break; } default: break; } + if (!success) + break; outGeom.maxVertex = geom.maxVertex; outGeom.vertexStride = geom.vertexStride; outGeom.vertexFormat = geom.vertexFormat; @@ -5073,8 +5088,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } buildInfo.triangles = reinterpret_cast* const&>(trianglesOffset); } + success = pPrimitiveCounts==primitiveCounts.data()+primitiveCounts.size(); rangeInfos.push_back(reinterpret_cast(geometryRangeInfoOffset)); -success = false; } // current recording buffer may have changed xferCmdBuf = params.transfer->getCommandBufferForRecording(); From bc9b5f154a30ed081c6f1e43b2f1c0ac6874d380 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 15 May 2025 15:59:03 +0200 Subject: [PATCH 126/346] make the default memcpy IUTilities buffer streaming callback public (its useful externally too) --- include/nbl/video/utilities/IUtilities.h | 34 +++++++++++------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index 09877b0d8f..00776ba01d 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -436,6 +436,18 @@ class NBL_API2 IUtilities : public core::IReferenceCounted return updateBufferRangeViaStagingBuffer(nextSubmit,bufferRange,callback); } + // + class CMemcpyUpstreamingDataProducer final : public IUpstreamingDataProducer + { + public: + inline uint32_t operator()(void* dst, const size_t offsetInRange, const uint32_t blockSize) override + { + memcpy(dst,reinterpret_cast(data)+offsetInRange,blockSize); + return blockSize; + } + + const 
void* data; + }; //! Copies `data` to stagingBuffer and Records the commands needed to copy the data from stagingBuffer to `bufferRange.buffer`. //! Returns same as `updateBufferRangeViaStagingBuffer` with a callback instead of a pointer, make sure to submit with `nextSubmit.popSubmit()` after this function returns. //! Parameters: @@ -448,25 +460,9 @@ class NBL_API2 IUtilities : public core::IReferenceCounted template requires std::is_same_v,SIntendedSubmitInfo> inline bool updateBufferRangeViaStagingBuffer(IntendedSubmitInfo&& nextSubmit, const asset::SBufferRange& bufferRange, const void* data) { - // We check the guarantees of our documentation with the asserts while we're at it -#ifdef _NBL_DEBUG - size_t prevRangeEnd = 0; -#endif - - auto retval = updateBufferRangeViaStagingBuffer(nextSubmit,bufferRange,wrapUpstreamingDataProducerLambda( - [&](void* dst, const size_t offsetInRange, const uint32_t blockSize) -> uint32_t - { -#ifdef _NBL_DEBUG - assert(offsetInRange==prevRangeEnd); - prevRangeEnd = offsetInRange+blockSize; -#endif - memcpy(dst,reinterpret_cast(data)+offsetInRange,blockSize); - return blockSize; - } - )); -#ifdef _NBL_DEBUG - assert(prevRangeEnd==bufferRange.size); -#endif + CMemcpyUpstreamingDataProducer memcpyCb; + memcpyCb.data = data; + bool retval = updateBufferRangeViaStagingBuffer(nextSubmit,bufferRange,memcpyCb); return retval; } From ce884ca3b6c0818490670c3d3c5df5124a8217f2 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 15 May 2025 16:13:32 +0200 Subject: [PATCH 127/346] update boost submodule --- 3rdparty/boost/superproject | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/boost/superproject b/3rdparty/boost/superproject index dcc3e1ade0..3b9e116eee 160000 --- a/3rdparty/boost/superproject +++ b/3rdparty/boost/superproject @@ -1 +1 @@ -Subproject commit dcc3e1ade0ae8e7ea0eadc2d951efb1e53450bff +Subproject commit 3b9e116eeee85ab8fd0d8e5a97364fff5f02eb86 From 
e5f610acb6a9c857c79215f8ee0a22420cde147e Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 15 May 2025 18:24:26 +0200 Subject: [PATCH 128/346] Resolve issues with private submodule updates, update 3rdparty/boost/CMakeLists.txt and refactor cmake/submodules/update.cmake, never touch private key during CMake configuration if updating public repositories --- 3rdparty/boost/CMakeLists.txt | 140 +++++++++-------- cmake/submodules/update.cmake | 272 +++++++++------------------------- 2 files changed, 149 insertions(+), 263 deletions(-) diff --git a/3rdparty/boost/CMakeLists.txt b/3rdparty/boost/CMakeLists.txt index 194ad3c35c..3c95234b8e 100644 --- a/3rdparty/boost/CMakeLists.txt +++ b/3rdparty/boost/CMakeLists.txt @@ -1,65 +1,38 @@ -set(BOOST_PREPROCESSOR_INCLUDE "${CMAKE_CURRENT_SOURCE_DIR}/superproject/libs/preprocessor/include" CACHE PATH "" FORCE) - -get_filename_component(_BOOST_PREPROCESSOR_BR_BUNDLE_SEARCH_DIRECTORY_ "${BOOST_PREPROCESSOR_INCLUDE}" ABSOLUTE) -get_filename_component(_BOOST_PREPROCESSOR_BR_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) -get_filename_component(_BOOST_PREPROCESSOR_BR_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) -set(BOOST_BUILTIN_RESOURCES_DIRECTORY_PATH "${_BOOST_PREPROCESSOR_BR_BUNDLE_SEARCH_DIRECTORY_}/boost" CACHE INTERNAL "" FORCE) - -if(NBL_EMBED_BUILTIN_RESOURCES) - include("${NBL_ROOT_PATH}/src/nbl/builtin/utils.cmake") - - file(GLOB_RECURSE BOOST_HEADERS_REC_REL RELATIVE "${BOOST_BUILTIN_RESOURCES_DIRECTORY_PATH}" "${BOOST_PREPROCESSOR_INCLUDE}/*") - - foreach(BOOST_HEADER_REL IN LISTS BOOST_HEADERS_REC_REL) - LIST_BUILTIN_RESOURCE(BOOST_RESOURCES_TO_EMBED "${BOOST_HEADER_REL}") - endforeach() - - ADD_CUSTOM_BUILTIN_RESOURCES(boostBuiltinResourceData BOOST_RESOURCES_TO_EMBED "${_BOOST_PREPROCESSOR_BR_BUNDLE_SEARCH_DIRECTORY_}" "boost" "boost::builtin" "${_BOOST_PREPROCESSOR_BR_OUTPUT_DIRECTORY_HEADER_}" "${_BOOST_PREPROCESSOR_BR_OUTPUT_DIRECTORY_HEADER_}" "STATIC" 
"INTERNAL") -endif() - get_filename_component(NBL_BOOST_WAVE_DEP_FILE "${CMAKE_CURRENT_SOURCE_DIR}/dep/wave.cmake" ABSOLUTE) -if(NOT EXISTS "${NBL_BOOST_WAVE_DEP_FILE}") - message(FATAL_ERROR "Internal error, generate NBL_BOOST_WAVE_DEP_FILE by enabling NBL_BOOST_GENERATE_DEP_LIST!") -endif() - -set(BOOST_STAGEDIR "${CMAKE_CURRENT_BINARY_DIR}/boost/superproject/stage") -include("${NBL_BOOST_WAVE_DEP_FILE}") - -foreach(BOOST_LIB IN LISTS NBL_BOOST_LIBS) - add_subdirectory(superproject/libs/${BOOST_LIB} EXCLUDE_FROM_ALL) -endforeach() - -add_subdirectory(superproject/libs/wave EXCLUDE_FROM_ALL) - -list(APPEND NBL_BOOST_TARGETS boost_wave) # wave -foreach(BOOST_LIB IN LISTS NBL_BOOST_LIBS) - if(TARGET boost_${BOOST_LIB}) # wave's deps - list(APPEND NBL_BOOST_TARGETS boost_${BOOST_LIB}) - endif() -endforeach() - -set(NBL_BOOST_TARGETS - ${NBL_BOOST_TARGETS} -PARENT_SCOPE) - # Boost uses it's own tool for generating dependency list for targets, therefore we # can make sure manually added dependency subdirectories for a library are valid # https://www.boost.org/doc/libs/1_83_0/tools/boostdep/doc/html/index.html#boostdep.introduction.building_boostdep if(NBL_BOOST_GENERATE_DEP_LIST) # internal, for Nabla devs - if(WIN32) - set(NBL_BOOSTDEP_EXE "boostdep.exe") - else() - set(NBL_BOOSTDEP_EXE "boostdep") + if(NOT WIN32) + message(FATAL_ERROR "NBL_BOOST_GENERATE_DEP_LIST only for Windows host!") endif() - - set(NBL_BOOSTDEP_EXE_FILEPATH "${CMAKE_CURRENT_BINARY_DIR}/superproject/tools/boostdep/bin/${NBL_BOOSTDEP_EXE}") - + macro(NBL_BOOST_EXECUTE) execute_process(COMMAND ${ARGV} WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/superproject") endmacro() + NBL_BOOST_EXECUTE(git config --file .gitmodules --get-regexp path OUTPUT_VARIABLE NBL_OUTPUT_VARIABLE) + string(REGEX REPLACE "\n" ";" NBL_SUBMODULE_CONFIG_LIST "${NBL_OUTPUT_VARIABLE}") + + foreach(NBL_SUBMODULE_NAME ${NBL_SUBMODULE_CONFIG_LIST}) + string(REGEX MATCH "submodule\\.(.*)\\.path" NBL_SUBMODULE_NAME 
"${NBL_SUBMODULE_NAME}") + list(APPEND BOOST_SUBMODULES "${CMAKE_MATCH_1}") + endforeach() + + # sync & force update of all boost modules first for the tool purpose (sry guys who use the tool, you need to clone all, I want to keep it simple) + NBL_BOOST_EXECUTE(git submodule sync) + list(APPEND BOOST_FORCE_ALL_CONFIG -c url.https://github.com/.insteadOf=git@github.com:) + foreach(SUBMODULE ${BOOST_SUBMODULES}) + list(APPEND BOOST_FORCE_ALL_CONFIG -c submodule.${SUBMODULE}.update=checkout) + endforeach() + + NBL_BOOST_EXECUTE(git ${BOOST_FORCE_ALL_CONFIG} submodule update --init --recursive -f) + + # build boost dep executable + set(NBL_BOOSTDEP_EXE "boostdep.exe") + set(NBL_BOOSTDEP_EXE_FILEPATH "${CMAKE_CURRENT_BINARY_DIR}/superproject/tools/boostdep/bin/${NBL_BOOSTDEP_EXE}") if(NOT EXISTS "${NBL_BOOSTDEP_EXE_FILEPATH}") NBL_BOOST_EXECUTE(cmd /C bootstrap.bat) NBL_BOOST_EXECUTE(cmd /C b2.exe tools/boostdep/build) @@ -68,6 +41,7 @@ if(NBL_BOOST_GENERATE_DEP_LIST) # internal, for Nabla devs NBL_BOOST_EXECUTE(git reset --hard) endif() + # get wave dependency info NBL_BOOST_EXECUTE("${NBL_BOOSTDEP_EXE_FILEPATH}" --boost-root "${CMAKE_CURRENT_SOURCE_DIR}/superproject" --brief wave OUTPUT_VARIABLE NBL_OUTPUT_VAR ) @@ -81,22 +55,66 @@ if(NBL_BOOST_GENERATE_DEP_LIST) # internal, for Nabla devs list(FILTER NBL_BOOST_LIBS EXCLUDE REGEX "(unknown)") string(REPLACE "~" "/" NBL_BOOST_LIBS "${NBL_BOOST_LIBS}") - # NOTE: you commit this file to version control AND boost's .gitmodules *if got changed*, use when updating boost to more recent version + # update boost .gitmodules configuration, discard all but modules reported by wave + # NOTE: you commit this file to version control AND boost's .gitmodules *if got changed*, + # use when updating boost to more recent version file(WRITE "${NBL_BOOST_WAVE_DEP_FILE}" "set(NBL_BOOST_LIBS ${NBL_BOOST_LIBS})") - NBL_BOOST_EXECUTE(git config --file .gitmodules --get-regexp path OUTPUT_VARIABLE NBL_OUTPUT_VARIABLE) - - string(REGEX REPLACE 
"\n" ";" NBL_SUBMODULE_CONFIG_LIST "${NBL_OUTPUT_VARIABLE}") - message(STATUS "Updating boost .gitmodules") - foreach(NBL_SUBMODULE_NAME ${NBL_SUBMODULE_CONFIG_LIST}) - string(REGEX MATCH "submodule\\.(.*)\\.path" NBL_SUBMODULE_NAME "${NBL_SUBMODULE_NAME}") - NBL_BOOST_EXECUTE(git config --file .gitmodules submodule.${CMAKE_MATCH_1}.update none) # fallback, ignore all + foreach(SUBMODULE ${BOOST_SUBMODULES}) + # 1) fallback, ignore all + NBL_BOOST_EXECUTE(git config --file .gitmodules submodule.${SUBMODULE}.update none) endforeach() foreach(NAME ${NBL_BOOST_LIBS}) string(REPLACE "/" "_" SUBMODULE "${NAME}") - message(STATUS "BOOST SUBMODULE = ${SUBMODULE}") - NBL_BOOST_EXECUTE(git config --file .gitmodules submodule.${SUBMODULE}.update checkout) # pick only those reported by the module we use + message(STATUS "WAVE BOOST DEP SUBMODULE = ${SUBMODULE}") + # 2) pick only submodules reported by wave + NBL_BOOST_EXECUTE(git config --file .gitmodules submodule.${SUBMODULE}.update checkout) + endforeach() + # 3) and the top module itself + NBL_BOOST_EXECUTE(git config --file .gitmodules submodule.wave.update checkout) +endif() + +set(BOOST_PREPROCESSOR_INCLUDE "${CMAKE_CURRENT_SOURCE_DIR}/superproject/libs/preprocessor/include" CACHE PATH "" FORCE) + +get_filename_component(_BOOST_PREPROCESSOR_BR_BUNDLE_SEARCH_DIRECTORY_ "${BOOST_PREPROCESSOR_INCLUDE}" ABSOLUTE) +get_filename_component(_BOOST_PREPROCESSOR_BR_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) +get_filename_component(_BOOST_PREPROCESSOR_BR_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) +set(BOOST_BUILTIN_RESOURCES_DIRECTORY_PATH "${_BOOST_PREPROCESSOR_BR_BUNDLE_SEARCH_DIRECTORY_}/boost" CACHE INTERNAL "" FORCE) + +if(NBL_EMBED_BUILTIN_RESOURCES) + include("${NBL_ROOT_PATH}/src/nbl/builtin/utils.cmake") + + file(GLOB_RECURSE BOOST_HEADERS_REC_REL RELATIVE "${BOOST_BUILTIN_RESOURCES_DIRECTORY_PATH}" "${BOOST_PREPROCESSOR_INCLUDE}/*") + + foreach(BOOST_HEADER_REL IN 
LISTS BOOST_HEADERS_REC_REL) + LIST_BUILTIN_RESOURCE(BOOST_RESOURCES_TO_EMBED "${BOOST_HEADER_REL}") endforeach() -endif() \ No newline at end of file + + ADD_CUSTOM_BUILTIN_RESOURCES(boostBuiltinResourceData BOOST_RESOURCES_TO_EMBED "${_BOOST_PREPROCESSOR_BR_BUNDLE_SEARCH_DIRECTORY_}" "boost" "boost::builtin" "${_BOOST_PREPROCESSOR_BR_OUTPUT_DIRECTORY_HEADER_}" "${_BOOST_PREPROCESSOR_BR_OUTPUT_DIRECTORY_HEADER_}" "STATIC" "INTERNAL") +endif() + +if(NOT EXISTS "${NBL_BOOST_WAVE_DEP_FILE}") + message(FATAL_ERROR "Internal error, generate NBL_BOOST_WAVE_DEP_FILE by enabling NBL_BOOST_GENERATE_DEP_LIST!") +endif() + +set(BOOST_STAGEDIR "${CMAKE_CURRENT_BINARY_DIR}/boost/superproject/stage") +include("${NBL_BOOST_WAVE_DEP_FILE}") + +foreach(BOOST_LIB IN LISTS NBL_BOOST_LIBS) + add_subdirectory(superproject/libs/${BOOST_LIB} EXCLUDE_FROM_ALL) +endforeach() + +add_subdirectory(superproject/libs/wave EXCLUDE_FROM_ALL) + +list(APPEND NBL_BOOST_TARGETS boost_wave) # wave +foreach(BOOST_LIB IN LISTS NBL_BOOST_LIBS) + if(TARGET boost_${BOOST_LIB}) # wave's deps + list(APPEND NBL_BOOST_TARGETS boost_${BOOST_LIB}) + endif() +endforeach() + +set(NBL_BOOST_TARGETS + ${NBL_BOOST_TARGETS} +PARENT_SCOPE) \ No newline at end of file diff --git a/cmake/submodules/update.cmake b/cmake/submodules/update.cmake index d0365c72ca..5d2474330e 100644 --- a/cmake/submodules/update.cmake +++ b/cmake/submodules/update.cmake @@ -1,223 +1,91 @@ -include(ProcessorCount) find_package(Git REQUIRED) -option(NBL_UPDATE_GIT_SUBMODULE "Turn this ON to let CMake update all public submodules for you" ON) -option(NBL_FORCE_ON_UPDATE_GIT_SUBMODULE "Submodules will be updated with --force flag if NBL_FORCE_UPDATE_GIT_SUBMODULE is turned ON, use with caution - if there are any uncommited files in submodules' working tree they will be removed!" 
OFF) -option(NBL_SYNC_ON_UPDATE_GIT_SUBMODULE "Sync initialized submodule paths if NBL_FORCE_UPDATE_GIT_SUBMODULE is turned ON, this is useful when any submodule remote path got modified and you want to apply this modification to your local repository. Turning NBL_FORCE_ON_UPDATE_GIT_SUBMODULE implies this option" OFF) -option(NBL_UPDATE_GIT_SUBMODULE_INCLUDE_PRIVATE "Turn this ON to attempt to update private Nabla submodules" OFF) -option(NBL_UPDATE_GIT_SUBMODULE_NO_SEPARATE_SHELL "Turn this ON to prevent CMake from executing git submodules update or sync in a separate shell - be aware that the interaction with shell will be impossible in case of paraphrase prompt request of your key!" ON) -option(NBL_CI_GIT_SUBMODULES_SHALLOW "" OFF) +option(NBL_UPDATE_GIT_SUBMODULE "Turn ON to update submodules, only public by default" ON) +option(NBL_FORCE_ON_UPDATE_GIT_SUBMODULE "NBL_UPDATE_GIT_SUBMODULE logic with --force flag" OFF) +option(NBL_SYNC_ON_UPDATE_GIT_SUBMODULE "Sync submodule URLs" OFF) +option(NBL_UPDATE_GIT_SUBMODULE_INCLUDE_PRIVATE "NBL_UPDATE_GIT_SUBMODULE logic but includes private submodules, for Nabla devs" OFF) +option(NBL_SUBMODULES_SHALLOW "NBL_UPDATE_GIT_SUBMODULE logic with --depth=1" OFF) -# TODO: replace all of this command recording & proxy logic with executing single recurse one-liner including -c options for private submodules -# once we have relative URLs + all .gitmodules configs are polished (so basically we don't have to set some config options on fly) - -if(NOT DEFINED NBL_ROOT_PATH) +if(NBL_UPDATE_GIT_SUBMODULE) +block() get_filename_component(NBL_ROOT_PATH "${CMAKE_CURRENT_LIST_DIR}/../../" ABSOLUTE) -endif() - -if(NOT DEFINED THIRD_PARTY_SOURCE_DIR) set(THIRD_PARTY_SOURCE_DIR "${NBL_ROOT_PATH}/3rdparty") -endif() - -if(NOT DEFINED NBL_ROOT_PATH_BINARY) - set(NBL_ROOT_PATH_BINARY "${NBL_ROOT_PATH}/build/.submodules") -endif() - -if(NOT DEFINED NBL_BUILD_EXAMPLES) - set(NBL_BUILD_EXAMPLES ON) -endif() - -function(NBL_UPDATE_SUBMODULES) - 
ProcessorCount(_GIT_SUBMODULES_JOBS_AMOUNT_) - - set(PRIVATE_SUBMODULES - Ditt-Reference-Scenes - ) - - foreach(NBL_P_SUBMODULE_NAME ${PRIVATE_SUBMODULES}) - if(NBL_UPDATE_GIT_SUBMODULE_INCLUDE_PRIVATE) - list(APPEND NBL_CONFIG_PRIVATE_SETUP_CMD "-c submodule.\"${NBL_P_SUBMODULE_NAME}\".update=checkout") - else() - list(APPEND NBL_CONFIG_PRIVATE_SETUP_CMD "-c submodule.\"${NBL_P_SUBMODULE_NAME}\".update=none") - endif() - endforeach() - - if(NBL_CI_GIT_SUBMODULES_SHALLOW) - set(NBL_SHALLOW "--depth=1") - else() - set(NBL_SHALLOW "") + + if(NOT DEFINED NBL_ROOT_PATH_BINARY) + set(NBL_ROOT_PATH_BINARY "${NBL_ROOT_PATH}/build/.submodules") endif() - - if(NBL_FORCE_ON_UPDATE_GIT_SUBMODULE) - set(NBL_FORCE "--force") - else() - set(NBL_FORCE "") + + if(NOT DEFINED NBL_BUILD_EXAMPLES) + set(NBL_BUILD_EXAMPLES ON) endif() - macro(NBL_WRAPPER_COMMAND_EXCLUSIVE GIT_RELATIVE_ENTRY GIT_SUBMODULE_PATH SHOULD_RECURSIVE EXCLUDE_SUBMODULE_PATHS) - set(EXCLUDE_SUBMODULE_PATHS ${EXCLUDE_SUBMODULE_PATHS}) - set(SHOULD_RECURSIVE ${SHOULD_RECURSIVE}) - - if("${EXCLUDE_SUBMODULE_PATHS}" STREQUAL "") - set(NBL_EXCLUDE "") - else() - foreach(EXCLUDE_SUBMODULE_PATH ${EXCLUDE_SUBMODULE_PATHS}) - string(APPEND NBL_EXCLUDE "-c submodule.\"${EXCLUDE_SUBMODULE_PATH}\".update=none ") - endforeach() - - string(STRIP "${NBL_EXCLUDE}" NBL_EXCLUDE) - endif() - - if(SHOULD_RECURSIVE) - set(_NBL_EXECUTE_COMMAND_ "\"${GIT_EXECUTABLE}\" -C \"${NBL_ROOT_PATH}/${GIT_RELATIVE_ENTRY}\" ${NBL_EXCLUDE} ${NBL_CONFIG_PRIVATE_SETUP_CMD} submodule update --init -j ${_GIT_SUBMODULES_JOBS_AMOUNT_} ${NBL_FORCE} --recursive ${NBL_SHALLOW} ${GIT_SUBMODULE_PATH}") - else() - set(_NBL_EXECUTE_COMMAND_ "\"${GIT_EXECUTABLE}\" -C \"${NBL_ROOT_PATH}/${GIT_RELATIVE_ENTRY}\" ${NBL_EXCLUDE} ${NBL_CONFIG_PRIVATE_SETUP_CMD} submodule update --init -j ${_GIT_SUBMODULES_JOBS_AMOUNT_} ${NBL_FORCE} ${NBL_SHALLOW} ${GIT_SUBMODULE_PATH}") - endif() - - string(APPEND _NBL_UPDATE_SUBMODULES_COMMANDS_ "${_NBL_EXECUTE_COMMAND_}\n") - - 
unset(NBL_EXCLUDE) - endmacro() - - set(_NBL_UPDATE_SUBMODULES_CMD_NAME_ "nbl-update-submodules") - set(_NBL_UPDATE_SUBMODULES_CMD_FILE_ "${NBL_ROOT_PATH_BINARY}/${_NBL_UPDATE_SUBMODULES_CMD_NAME_}.cmd") - get_filename_component(_NBL_UPDATE_IMPL_CMAKE_FILE_ "${NBL_ROOT_PATH_BINARY}/${_NBL_UPDATE_SUBMODULES_CMD_NAME_}.cmake" ABSOLUTE) - - # Proxy script for inclusive submodule updating - string(APPEND NBL_IMPL_SCRIPT "set(NBL_ROOT_PATH \"${NBL_ROOT_PATH}\")\nset(_GIT_SUBMODULES_JOBS_AMOUNT_ ${_GIT_SUBMODULES_JOBS_AMOUNT_})\nset(GIT_EXECUTABLE \"${GIT_EXECUTABLE}\")\nset(NBL_SHALLOW \"${NBL_SHALLOW}\")\nset(NBL_FORCE \"${NBL_FORCE}\")\n\n") - string(APPEND NBL_IMPL_SCRIPT -[=[ -if(NOT DEFINED GIT_RELATIVE_ENTRY) - message(FATAL_ERROR "GIT_RELATIVE_ENTRY must be defined to use this script!") -endif() + # we force HTTPS traffic for all *public* submodules we update from CMake + # NOTE: it *doesn't* rewrite destination URLs after checkout, if you eg. + # clone with SSH you end up with it anyway, this way your private key + # is never involved during CMake configuration, unless you + # use NBL_UPDATE_GIT_SUBMODULE_INCLUDE_PRIVATE -if(NOT DEFINED INCLUDE_SUBMODULE_PATHS) - message(FATAL_ERROR "INCLUDE_SUBMODULE_PATHS must be defined to use this script!") -endif() + # Private refs (*), exclude from public update + list(APPEND NBL_CONFIG_SUBMODULE -c submodule.\"Ditt-Reference-Scenes\".update=none) -# update an inclusive submodule first -execute_process(COMMAND "${GIT_EXECUTABLE}" -C "${NBL_ROOT_PATH}" submodule update --init "${GIT_RELATIVE_ENTRY}") + unset(NBL_UPDATE_OPTIONS) -if("${INCLUDE_SUBMODULE_PATHS}" STREQUAL "") - set(NBL_SUBMODULE_UPDATE_CONFIG_ENTRY "") -else() - execute_process(COMMAND "${GIT_EXECUTABLE}" -C "${NBL_ROOT_PATH}/${GIT_RELATIVE_ENTRY}" config --file .gitmodules --get-regexp path - OUTPUT_VARIABLE NBL_OUTPUT_VARIABLE - ) + if(NBL_SUBMODULES_SHALLOW) + list(APPEND NBL_UPDATE_OPTIONS --depth=1) + endif() - string(REGEX REPLACE "\n" ";" 
NBL_SUBMODULE_CONFIG_LIST "${NBL_OUTPUT_VARIABLE}") - - foreach(NBL_SUBMODULE_NAME ${NBL_SUBMODULE_CONFIG_LIST}) - string(REGEX MATCH "submodule\\.(.*)\\.path" NBL_SUBMODULE_NAME "${NBL_SUBMODULE_NAME}") - list(APPEND NBL_ALL_SUBMODULES "${CMAKE_MATCH_1}") - endforeach() - - foreach(NBL_SUBMODULE_NAME ${NBL_ALL_SUBMODULES}) - list(FIND INCLUDE_SUBMODULE_PATHS "${NBL_SUBMODULE_NAME}" NBL_FOUND) - - if("${NBL_FOUND}" STREQUAL "-1") - list(APPEND NBL_CONFIG_SETUP_CMD "-c;submodule.${NBL_SUBMODULE_NAME}.update=none") # filter submodules - only those on the INCLUDE_SUBMODULE_PATHS list will be updated when recursive update is requested, all left will be skipped - endif() - endforeach() -endif() - -execute_process(COMMAND "${GIT_EXECUTABLE}" ${NBL_CONFIG_SETUP_CMD} submodule update --init -j ${_GIT_SUBMODULES_JOBS_AMOUNT_} --recursive ${NBL_SHALLOW} ${NBL_FORCE} - WORKING_DIRECTORY "${NBL_ROOT_PATH}/${GIT_RELATIVE_ENTRY}" -) -]=] -) - file(WRITE "${_NBL_UPDATE_IMPL_CMAKE_FILE_}" "${NBL_IMPL_SCRIPT}") - - macro(NBL_WRAPPER_COMMAND_INCLUSIVE GIT_RELATIVE_ENTRY INCLUDE_SUBMODULE_PATHS) - string(APPEND _NBL_UPDATE_SUBMODULES_COMMANDS_ "\"${CMAKE_COMMAND}\" \"-DGIT_RELATIVE_ENTRY=${GIT_RELATIVE_ENTRY}\" \"-DINCLUDE_SUBMODULE_PATHS=${INCLUDE_SUBMODULE_PATHS}\" -P \"${_NBL_UPDATE_IMPL_CMAKE_FILE_}\"\n") + if(NBL_FORCE_ON_UPDATE_GIT_SUBMODULE) + list(APPEND NBL_UPDATE_OPTIONS --force) + endif() + + if(NOT NBL_BUILD_EXAMPLES) + list(APPEND NBL_CONFIG_SUBMODULE -c submodule.\"examples_tests\".update=none) + endif() + + macro(NBL_GIT_COMMAND) + execute_process(COMMAND "${GIT_EXECUTABLE}" ${ARGV}) endmacro() + + if(NBL_SYNC_ON_UPDATE_GIT_SUBMODULE) + message(STATUS "Syncing Public submodules") + NBL_GIT_COMMAND(${NBL_CONFIG_SUBMODULE} submodule sync --recursive WORKING_DIRECTORY "${NBL_ROOT_PATH}") + endif() - if(NBL_UPDATE_GIT_SUBMODULE) - execute_process(COMMAND ${CMAKE_COMMAND} -E echo "All submodules are about to get updated and initialized in repository because 
NBL_UPDATE_GIT_SUBMODULE is turned ON!") - - include("${THIRD_PARTY_SOURCE_DIR}/boost/dep/wave.cmake") - - macro(NBL_IMPL_INIT_COMMON_SUBMODULES) - # 3rdparty except boost & gltf - set(NBL_3RDPARTY_MODULES_TO_SKIP - 3rdparty/boost/superproject # a lot of submodules we don't use - 3rdparty/glTFSampleModels # more then 2GB waste of space (disk + .gitmodules data) - ) - NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./3rdparty TRUE "${NBL_3RDPARTY_MODULES_TO_SKIP}") - - # boost's 3rdparties, special case - # TODO: fork boost and update .gitmodules to cover only libs we want to use - set(NBL_BOOST_LIBS_TO_INIT ${NBL_BOOST_LIBS} wave numeric_conversion) # wave and all of its deps, numeric_conversion is nested in conversion submodule (for some reason boostdep tool doesn't output it properly) - foreach(NBL_TARGET ${NBL_BOOST_LIBS_TO_INIT}) - list(APPEND NBL_BOOST_SUBMODULES_TO_INIT ${NBL_TARGET}) - endforeach() - NBL_WRAPPER_COMMAND_INCLUSIVE(3rdparty/boost/superproject "${NBL_BOOST_SUBMODULES_TO_INIT}") - - # tests - NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./tests FALSE "") - - # docker - NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./docker FALSE "") + message(STATUS "Updating Public submodules") + NBL_GIT_COMMAND(-c url.https://github.com/.insteadOf=git@github.com: ${NBL_CONFIG_SUBMODULE} submodule update --init --recursive ${NBL_UPDATE_OPTIONS} WORKING_DIRECTORY "${NBL_ROOT_PATH}") + + if(NBL_UPDATE_GIT_SUBMODULE_INCLUDE_PRIVATE) + # NOTE: your git must be installed with default Git Bash as shell + # otherwise it *may* fail, whether it works depends on your agent setup + + find_package(GitBash REQUIRED) + + macro(NBL_GIT_BASH_COMMAND) + execute_process(COMMAND "${GIT_BASH_EXECUTABLE}" "-c" ${ARGV}) endmacro() - - NBL_IMPL_INIT_COMMON_SUBMODULES() - - if(NBL_UPDATE_GIT_SUBMODULE_INCLUDE_PRIVATE) - NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./examples_tests TRUE "") - else() - # NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./ci TRUE "") TODO: enable it once we merge Ditt, etc - - # examples and their media - 
if(NBL_BUILD_EXAMPLES) - NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./examples_tests TRUE "") - endif() - endif() - - file(WRITE "${_NBL_UPDATE_SUBMODULES_CMD_FILE_}" "${_NBL_UPDATE_SUBMODULES_COMMANDS_}") - - if(WIN32) - if(NBL_UPDATE_GIT_SUBMODULE_NO_SEPARATE_SHELL) - set(UPDATE_COMMAND - nbl-update-submodules.cmd - ) - - execute_process(COMMAND ${UPDATE_COMMAND} - WORKING_DIRECTORY "${NBL_ROOT_PATH_BINARY}" - RESULT_VARIABLE _NBL_TMP_RET_CODE_ - ) - else() - find_package(GitBash REQUIRED) - - execute_process(COMMAND "${GIT_BASH_EXECUTABLE}" "-c" + + message(STATUS "Updating Private submodules") + string(REPLACE ";" " " NBL_UPDATE_OPTIONS "${NBL_UPDATE_OPTIONS}") + set(LOG_FILE "${NBL_ROOT_PATH_BINARY}/nbl-update-private-submodules.log") + set(BASH_CMD [=[ >&2 echo "" clear -./nbl-update-submodules.cmd 2>&1 | tee nbl-update-submodules.log -sleep 1 +{ + echo "=== $(date) :: Starting private submodule update ===" + git -c submodule.Ditt-Reference-Scenes.update=checkout -C @NBL_ROOT_PATH@/examples_tests/media submodule update --init Ditt-Reference-Scenes @NBL_UPDATE_OPTIONS@ + # more private submodule here + + echo "=== $(date) :: Created @LOG_FILE@ in your build directory. ===" + echo "=== $(date) :: Finished private submodule update ===" +} 2>&1 | tee @LOG_FILE@ clear -tput setaf 2; echo -e "Submodules have been updated! -Created nbl-update-submodules.log in your build directory." 
]=] - WORKING_DIRECTORY ${NBL_ROOT_PATH_BINARY} - OUTPUT_VARIABLE _NBL_TMP_OUTPUT_ - RESULT_VARIABLE _NBL_TMP_RET_CODE_ - OUTPUT_STRIP_TRAILING_WHITESPACE - ERROR_STRIP_TRAILING_WHITESPACE - ) - - unset(_NBL_TMP_OUTPUT_) - unset(_NBL_TMP_RET_CODE_) - - message(STATUS "Generated \"${NBL_ROOT_PATH_BINARY}/nbl-update-submodules.log\"") - endif() - - message(STATUS "Submodules have been updated!") - else() - execute_process(COMMAND "${_NBL_UPDATE_SUBMODULES_CMD_FILE_}") - endif() - else() - execute_process(COMMAND ${CMAKE_COMMAND} -E echo "NBL_UPDATE_GIT_SUBMODULE is turned OFF therefore submodules won't get updated.") + ) + string(CONFIGURE "${BASH_CMD}" BASH_CMD) + NBL_GIT_BASH_COMMAND("${BASH_CMD}" OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_STRIP_TRAILING_WHITESPACE RESULT_VARIABLE RES) + file(READ "${LOG_FILE}" LOG_CONTENT) + message(STATUS "${LOG_CONTENT}") endif() -endfunction() - -NBL_UPDATE_SUBMODULES() \ No newline at end of file +endblock() +endif() \ No newline at end of file From bf9390018f84be9c762eb6c152fe17a993b4e015 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 15 May 2025 19:42:46 +0200 Subject: [PATCH 129/346] use fetch.parallel=0 in CMake update --- cmake/submodules/update.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/submodules/update.cmake b/cmake/submodules/update.cmake index 5d2474330e..412cdf04e0 100644 --- a/cmake/submodules/update.cmake +++ b/cmake/submodules/update.cmake @@ -52,7 +52,7 @@ block() endif() message(STATUS "Updating Public submodules") - NBL_GIT_COMMAND(-c url.https://github.com/.insteadOf=git@github.com: ${NBL_CONFIG_SUBMODULE} submodule update --init --recursive ${NBL_UPDATE_OPTIONS} WORKING_DIRECTORY "${NBL_ROOT_PATH}") + NBL_GIT_COMMAND(-c fetch.parallel=0 -c url.https://github.com/.insteadOf=git@github.com: ${NBL_CONFIG_SUBMODULE} submodule update --init --recursive ${NBL_UPDATE_OPTIONS} WORKING_DIRECTORY "${NBL_ROOT_PATH}") if(NBL_UPDATE_GIT_SUBMODULE_INCLUDE_PRIVATE) # NOTE: your 
git must be installed with default Git Bash as shell From 55d89c5c2e3be03e178af923f0b70dc3420f63d4 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 16 May 2025 10:09:41 +0700 Subject: [PATCH 130/346] no need to store locals in reduce --- .../nbl/builtin/hlsl/workgroup2/shared_scan.hlsl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 1043decd73..add3acc687 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -94,20 +94,20 @@ struct reduce using params_lv1_t = subgroup2::ArithmeticParams; BinOp binop; - vector_lv0_t scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 0 scan subgroup2::reduction reduction0; [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - dataAccessor.get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); - scan_local[idx] = reduction0(scan_local[idx]); + vector_lv0_t scan_local; + dataAccessor.get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); + scan_local = reduction0(scan_local); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); - scratchAccessor.set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + scratchAccessor.set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan 
(reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -227,20 +227,20 @@ struct reduce using params_lv2_t = subgroup2::ArithmeticParams; BinOp binop; - vector_lv0_t scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 0 scan subgroup2::reduction reduction0; [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - dataAccessor.get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); - scan_local[idx] = reduction0(scan_local[idx]); + vector_lv0_t scan_local; + dataAccessor.get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); + scan_local = reduction0(scan_local); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); - scratchAccessor.set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + scratchAccessor.set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); From 4e4f26e994a2ca5c5009ba3768b0121b627f50bd Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 16 May 2025 11:18:51 +0700 Subject: [PATCH 131/346] added workgroup accessor concepts, refactor accessor usage --- examples_tests | 2 +- .../accessors/workgroup_arithmetic.hlsl | 57 ++++++++++++++++ .../builtin/hlsl/workgroup2/arithmetic.hlsl | 7 +- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 66 +++++++++---------- src/nbl/builtin/CMakeLists.txt | 9 +++ 5 files 
changed, 104 insertions(+), 37 deletions(-) create mode 100644 include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl diff --git a/examples_tests b/examples_tests index 1de31ddfd7..e828dc49ef 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 1de31ddfd725009bd650f1fe80f1c4a8c2e6a14a +Subproject commit e828dc49ef0a223dcbb8b4af8d722974747f29ee diff --git a/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl b/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl new file mode 100644 index 0000000000..de5e5a3c35 --- /dev/null +++ b/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl @@ -0,0 +1,57 @@ +#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_WORKGROUP_ARITHMETIC_INCLUDED_ +#define _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_WORKGROUP_ARITHMETIC_INCLUDED_ + +#include "nbl/builtin/hlsl/concepts.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace workgroup2 +{ + +#define NBL_CONCEPT_NAME ArithmeticSharedMemoryAccessor +#define NBL_CONCEPT_TPLT_PRM_KINDS (typename) +#define NBL_CONCEPT_TPLT_PRM_NAMES (T) +#define NBL_CONCEPT_PARAM_0 (accessor, T) +#define NBL_CONCEPT_PARAM_1 (index, uint32_t) +#define NBL_CONCEPT_PARAM_2 (val, uint32_t) +NBL_CONCEPT_BEGIN(3) +#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 +#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 +#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 +NBL_CONCEPT_END( + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.workgroupExecutionAndMemoryBarrier()), is_same_v, void)) +); +#undef val +#undef index +#undef accessor +#include + +#define NBL_CONCEPT_NAME ArithmeticDataAccessor +#define NBL_CONCEPT_TPLT_PRM_KINDS (typename) +#define NBL_CONCEPT_TPLT_PRM_NAMES (T) +#define NBL_CONCEPT_PARAM_0 (accessor, T) +#define NBL_CONCEPT_PARAM_1 
(index, uint32_t) +#define NBL_CONCEPT_PARAM_2 (val, uint32_t) +NBL_CONCEPT_BEGIN(3) +#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 +#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 +#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 +NBL_CONCEPT_END( + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.workgroupExecutionAndMemoryBarrier()), is_same_v, void)) +); +#undef val +#undef index +#undef accessor +#include + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl index 3b4a028d2c..d0a26cdf94 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl @@ -8,6 +8,7 @@ #include "nbl/builtin/hlsl/functional.hlsl" #include "nbl/builtin/hlsl/workgroup/ballot.hlsl" #include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" +#include "nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl" #include "nbl/builtin/hlsl/workgroup2/shared_scan.hlsl" @@ -21,7 +22,7 @@ namespace workgroup2 template struct reduction { - template + template && ArithmeticSharedMemoryAccessor) static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { impl::reduce fn; @@ -32,7 +33,7 @@ struct reduction template struct inclusive_scan { - template + template && ArithmeticSharedMemoryAccessor) static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { impl::scan fn; @@ -43,7 +44,7 @@ struct inclusive_scan template struct exclusive_scan { - template + template && ArithmeticSharedMemoryAccessor) static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { impl::scan fn; diff --git 
a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index add3acc687..d53bfd6000 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -43,9 +43,9 @@ struct reduce subgroup2::reduction reduction; vector_t value; - dataAccessor.get(workgroup::SubgroupContiguousIndex(), value); + dataAccessor.template get(workgroup::SubgroupContiguousIndex(), value); value = reduction(value); - dataAccessor.set(workgroup::SubgroupContiguousIndex(), value); + dataAccessor.template set(workgroup::SubgroupContiguousIndex(), value); } }; @@ -63,7 +63,7 @@ struct scan using params_t = subgroup2::ArithmeticParams; vector_t value; - dataAccessor.get(workgroup::SubgroupContiguousIndex(), value); + dataAccessor.template get(workgroup::SubgroupContiguousIndex(), value); if (Exclusive) { subgroup2::exclusive_scan excl_scan; @@ -74,7 +74,7 @@ struct scan subgroup2::inclusive_scan incl_scan; value = incl_scan(value); } - dataAccessor.set(workgroup::SubgroupContiguousIndex(), value); // can be safely merged with above lines? + dataAccessor.template set(workgroup::SubgroupContiguousIndex(), value); // can be safely merged with above lines? 
} }; @@ -101,13 +101,13 @@ struct reduce for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { vector_lv0_t scan_local; - dataAccessor.get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); scan_local = reduction0(scan_local); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); - scratchAccessor.set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -119,9 +119,9 @@ struct reduce vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); + scratchAccessor.template get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); lv1_val = reduction1(lv1_val); - scratchAccessor.set(invocationIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + scratchAccessor.template set(invocationIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -130,8 +130,8 @@ struct reduce for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { scalar_t reduce_val; - scratchAccessor.get(glsl::gl_SubgroupInvocationID(),reduce_val); - dataAccessor.set(idx * 
Config::WorkgroupSize + virtualInvocationIndex, reduce_val); + scratchAccessor.template get(glsl::gl_SubgroupInvocationID(),reduce_val); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, hlsl::promote(reduce_val)); } } }; @@ -158,13 +158,13 @@ struct scan [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - dataAccessor.get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); scan_local[idx] = inclusiveScan0(scan_local[idx]); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); - scratchAccessor.set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + scratchAccessor.template set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -177,10 +177,10 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+prevIndex,lv1_val[i]); + scratchAccessor.template get(i*Config::SubgroupsPerVirtualWorkgroup+prevIndex,lv1_val[i]); vector_lv1_t shiftedInput = hlsl::mix(hlsl::promote(BinOp::identity), lv1_val, bool(invocationIndex)); shiftedInput = inclusiveScan1(shiftedInput); - scratchAccessor.set(invocationIndex, shiftedInput[Config::ItemsPerInvocation_1-1]); + scratchAccessor.template 
set(invocationIndex, shiftedInput[Config::ItemsPerInvocation_1-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -190,7 +190,7 @@ struct scan { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); scalar_t left; - scratchAccessor.get(virtualSubgroupID,left); + scratchAccessor.template get(virtualSubgroupID,left); if (Exclusive) { scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(scan_local[idx][Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); @@ -204,7 +204,7 @@ struct scan for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) scan_local[idx][i] = binop(left, scan_local[idx][i]); } - dataAccessor.set(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); } } }; @@ -234,13 +234,13 @@ struct reduce for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { vector_lv0_t scan_local; - dataAccessor.get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); scan_local = reduction0(scan_local); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); - scratchAccessor.set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } 
scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -252,12 +252,12 @@ struct reduce vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); + scratchAccessor.template get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); lv1_val = reduction1(lv1_val); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t bankedIndex = (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (invocationIndex/Config::ItemsPerInvocation_2); - scratchAccessor.set(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + scratchAccessor.template set(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -269,9 +269,9 @@ struct reduce vector_lv2_t lv2_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv2_val[i]); + scratchAccessor.template get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv2_val[i]); lv2_val = reduction2(lv2_val); - scratchAccessor.set(invocationIndex, lv2_val[Config::ItemsPerInvocation_2-1]); + scratchAccessor.template set(invocationIndex, lv2_val[Config::ItemsPerInvocation_2-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -280,8 +280,8 @@ struct reduce for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { scalar_t reduce_val; - scratchAccessor.get(glsl::gl_SubgroupInvocationID(),reduce_val); - dataAccessor.set(idx * Config::WorkgroupSize + virtualInvocationIndex, reduce_val); + scratchAccessor.template get(glsl::gl_SubgroupInvocationID(),reduce_val); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, reduce_val); } } }; @@ -310,13 +310,13 @@ struct scan [unroll] for 
(uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - dataAccessor.get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); scan_local[idx] = inclusiveScan0(scan_local[idx]); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); - scratchAccessor.set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + scratchAccessor.template set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -329,12 +329,12 @@ struct scan vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); + scratchAccessor.template get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); lv1_val = inclusiveScan1(lv1_val); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t bankedIndex = (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); - scratchAccessor.set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + scratchAccessor.template set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -347,7 +347,7 @@ struct scan const uint32_t prevIndex = 
invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.get(lv1_smem_size+i*Config::SubgroupSize+prevIndex,lv2_val[i]); + scratchAccessor.template get(lv1_smem_size+i*Config::SubgroupSize+prevIndex,lv2_val[i]); vector_lv2_t shiftedInput = hlsl::mix(hlsl::promote(BinOp::identity), lv2_val, bool(invocationIndex)); shiftedInput = inclusiveScan2(shiftedInput); @@ -356,10 +356,10 @@ struct scan for (uint32_t i = 0; i < Config::SubgroupsPerVirtualWorkgroup; i++) { scalar_t last_val; - scratchAccessor.get((Config::ItemsPerInvocation_1-1)*Config::SubgroupsPerVirtualWorkgroup+(Config::SubgroupsPerVirtualWorkgroup-1-i),last_val); + scratchAccessor.template get((Config::ItemsPerInvocation_1-1)*Config::SubgroupsPerVirtualWorkgroup+(Config::SubgroupsPerVirtualWorkgroup-1-i),last_val); scalar_t val = hlsl::mix(hlsl::promote(BinOp::identity), lv2_val, bool(i)); val = binop(last_val, shiftedInput[Config::ItemsPerInvocation_2-1]); - scratchAccessor.set((Config::ItemsPerInvocation_1-1)*Config::SubgroupsPerVirtualWorkgroup+(Config::SubgroupsPerVirtualWorkgroup-1-i), last_val); + scratchAccessor.template set((Config::ItemsPerInvocation_1-1)*Config::SubgroupsPerVirtualWorkgroup+(Config::SubgroupsPerVirtualWorkgroup-1-i), last_val); } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -370,7 +370,7 @@ struct scan { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const scalar_t left; - scratchAccessor.get(virtualSubgroupID, left); + scratchAccessor.template get(virtualSubgroupID, left); if (Exclusive) { scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(scan_local[idx][Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); @@ -384,7 +384,7 @@ struct scan for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) scan_local[idx][i] = binop(left, scan_local[idx][i]); } - dataAccessor.set(idx * Config::WorkgroupSize + 
virtualInvocationIndex, scan_local[idx]); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); } } }; diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index 9333a0d3b4..a6405a3c99 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -330,6 +330,10 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/basic.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/arithmetic_portability.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/arithmetic_portability_impl.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/fft.hlsl") +#subgroup2 +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup2/ballot.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup2/arithmetic_portability.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup2/arithmetic_portability_impl.hlsl") #shared header between C++ and HLSL LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/surface_transform.h") #workgroup @@ -341,6 +345,10 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/fft.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/scratch_size.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/shared_scan.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/shuffle.hlsl") +#workgroup2 +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/arithmetic_config.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/arithmetic.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/shared_scan.hlsl") #Extensions LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ext/FullScreenTriangle/SVertexAttributes.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ext/FullScreenTriangle/default.vert.hlsl") @@ -362,6 +370,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/loadable_i 
LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/mip_mapped.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/storable_image.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/fft.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/workgroup_arithmetic.hlsl") #tgmath LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tgmath.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tgmath/impl.hlsl") From 6884d4548e758c6591b7b291e2895457de4a36ab Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 16 May 2025 18:19:55 +0700 Subject: [PATCH 132/346] Add non const computeDependants to IAsset and its child classes --- include/nbl/asset/IAsset.h | 8 ++- include/nbl/asset/ICPUAccelerationStructure.h | 5 ++ include/nbl/asset/ICPUAnimationLibrary.h | 9 +++ include/nbl/asset/ICPUBuffer.h | 5 ++ include/nbl/asset/ICPUBufferView.h | 14 ++++- include/nbl/asset/ICPUComputePipeline.h | 36 +++++++---- include/nbl/asset/ICPUDescriptorSet.h | 1 + include/nbl/asset/ICPUDescriptorSetLayout.h | 28 ++++++--- include/nbl/asset/ICPUGraphicsPipeline.h | 23 +++++-- include/nbl/asset/ICPUImage.h | 5 ++ include/nbl/asset/ICPUImageView.h | 14 ++++- include/nbl/asset/ICPUMesh.h | 5 ++ include/nbl/asset/ICPUMeshBuffer.h | 5 ++ include/nbl/asset/ICPUPipeline.h | 2 +- include/nbl/asset/ICPUPipelineCache.h | 5 ++ include/nbl/asset/ICPUPipelineLayout.h | 13 ++++ include/nbl/asset/ICPURayTracingPipeline.h | 26 +++++--- include/nbl/asset/ICPURenderpass.h | 5 ++ .../asset/ICPURenderpassIndependentPipeline.h | 5 ++ include/nbl/asset/ICPUSampler.h | 5 ++ include/nbl/asset/ICPUSkeleton.h | 16 ++++- include/nbl/asset/IShader.h | 15 ++++- src/nbl/asset/ICPUDescriptorSet.cpp | 62 +++++++++++-------- 23 files changed, 248 insertions(+), 64 deletions(-) diff --git a/include/nbl/asset/IAsset.h b/include/nbl/asset/IAsset.h index c3950c4912..0e91b99c36 100644 --- a/include/nbl/asset/IAsset.h +++ 
b/include/nbl/asset/IAsset.h @@ -158,7 +158,13 @@ class IAsset : virtual public core::IReferenceCounted virtual core::unordered_set computeDependants() const = 0; - virtual bool valid() const = 0; + virtual core::unordered_set computeDependants() = 0; + + virtual bool valid() const + { + //TODO(kevinyu): Temporary set this to true to make changes compile. Will revisit this later for each asset + return true; + } protected: inline IAsset() = default; diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index affd165667..3ac794a888 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -141,6 +141,11 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo return {}; } + inline core::unordered_set computeDependants() override + { + return {}; + } + inline core::blake3_hash_t computeContentHash() const override { if (!missingContent()) diff --git a/include/nbl/asset/ICPUAnimationLibrary.h b/include/nbl/asset/ICPUAnimationLibrary.h index 5fea370b63..8a6cdaf52a 100644 --- a/include/nbl/asset/ICPUAnimationLibrary.h +++ b/include/nbl/asset/ICPUAnimationLibrary.h @@ -100,6 +100,15 @@ class ICPUAnimationLibrary final : public IAnimationLibrary, public { return { m_keyframeStorageBinding.buffer.get(), m_timestampStorageBinding.buffer.get(), m_animationStorageRange.buffer.get() }; } + + private: + + template + requires(std::same_as, ICPUAnimationLibrary>) + static auto computeDependantsImpl(Self* self) { + using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; + return core::unordered_set{ self->m_keyframeStorageBinding.buffer.get(), self->m_timestampStorageBinding.buffer.get(), self->m_animationStorageRange.buffer.get() }; + } }; } diff --git a/include/nbl/asset/ICPUBuffer.h b/include/nbl/asset/ICPUBuffer.h index 2d495ef02e..0ad1d7bf48 100644 --- a/include/nbl/asset/ICPUBuffer.h +++ b/include/nbl/asset/ICPUBuffer.h @@ -80,6 
+80,11 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed return {}; } + inline core::unordered_set computeDependants() override + { + return {}; + } + inline core::blake3_hash_t computeContentHash() const override { core::blake3_hasher hasher; diff --git a/include/nbl/asset/ICPUBufferView.h b/include/nbl/asset/ICPUBufferView.h index 7f3f676695..55d50356c1 100644 --- a/include/nbl/asset/ICPUBufferView.h +++ b/include/nbl/asset/ICPUBufferView.h @@ -30,7 +30,12 @@ class ICPUBufferView : public IBufferView, public IAsset inline core::unordered_set computeDependants() const override { - return { m_buffer.get() }; + return computeDependantsImpl(this); + } + + inline core::unordered_set computeDependants() override + { + return computeDependantsImpl(this); } ICPUBuffer* getUnderlyingBuffer() @@ -54,6 +59,13 @@ class ICPUBufferView : public IBufferView, public IAsset protected: virtual ~ICPUBufferView() = default; + private: + template + requires(std::same_as, ICPUBufferView>) + static auto computeDependantsImpl(Self* self) { + using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; + return core::unordered_set{ self->m_buffer.get() }; + } }; } diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 8d8b343a3d..f6b689857f 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -25,31 +25,28 @@ class ICPUComputePipeline final : public ICPUPipeline(retval,core::dont_grab); } - inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final - { - auto newPipeline = new ICPUComputePipeline(layout.get()); - newPipeline->m_specInfo = m_specInfo.clone(depth); - return core::smart_refctd_ptr(newPipeline, core::dont_grab); - } - constexpr static inline auto AssetType = ET_COMPUTE_PIPELINE; inline E_TYPE getAssetType() const override { return AssetType; } //! 
- virtual core::unordered_set computeDependants() const override + inline core::unordered_set computeDependants() const override + { + return computeDependantsImpl(this); + } + + inline core::unordered_set computeDependants() override { - return {m_layout.get(), m_specInfo.shader.get()}; + return computeDependantsImpl(this); } - inline virtual std::span getSpecInfo(hlsl::ShaderStage stage) const override final + inline std::span getSpecInfo(hlsl::ShaderStage stage) const override final { - if (stage==hlsl::ShaderStage::ESS_COMPUTE && isMutable()) + if (stage==hlsl::ShaderStage::ESS_COMPUTE) return {&m_specInfo,1}; return {}; } - - inline virtual bool valid() const override final + inline bool valid() const override { if (!m_layout) return false; if (!m_layout->valid()) return false; @@ -64,10 +61,23 @@ class ICPUComputePipeline final : public ICPUPipeline clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final + { + auto newPipeline = new ICPUComputePipeline(layout.get()); + newPipeline->m_specInfo = m_specInfo.clone(depth); + return core::smart_refctd_ptr(newPipeline, core::dont_grab); + } + explicit ICPUComputePipeline(const ICPUPipelineLayout* layout): base_t(layout, {}) {} + template + requires(std::same_as, ICPUComputePipeline>) + static auto computeDependantsImpl(Self* self) { + using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; + return core::unordered_set{ self->m_layout.get(), self->m_specInfo.shader.get() }; + } }; } diff --git a/include/nbl/asset/ICPUDescriptorSet.h b/include/nbl/asset/ICPUDescriptorSet.h index 77640b8f9f..c8a6f68d22 100644 --- a/include/nbl/asset/ICPUDescriptorSet.h +++ b/include/nbl/asset/ICPUDescriptorSet.h @@ -78,6 +78,7 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSet clone(uint32_t _depth = ~0u) const override; core::unordered_set computeDependants() const override; + core::unordered_set computeDependants() override; protected: virtual ~ICPUDescriptorSet() = default; diff 
--git a/include/nbl/asset/ICPUDescriptorSetLayout.h b/include/nbl/asset/ICPUDescriptorSetLayout.h index 2ddf1e26be..b2c06792d6 100644 --- a/include/nbl/asset/ICPUDescriptorSetLayout.h +++ b/include/nbl/asset/ICPUDescriptorSetLayout.h @@ -59,18 +59,32 @@ class ICPUDescriptorSetLayout : public IDescriptorSetLayout, public core::unordered_set computeDependants() const override { - if (!m_immutableSamplers) return {}; - core::unordered_set dependants; - for (const auto& sampler: m_immutableSamplers) - { - dependants.insert(sampler.get()); - } - return dependants; + return computeDependantsImpl(this); + } + + core::unordered_set computeDependants() override + { + return computeDependantsImpl(this); } protected: virtual ~ICPUDescriptorSetLayout() = default; + + private: + template + requires(std::same_as, ICPUDescriptorSetLayout>) + static auto computeDependantsImpl(Self* self) { + using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; + core::unordered_set dependants; + if (!self->m_immutableSamplers) return dependants; + for (const auto& sampler: self->m_immutableSamplers) + { + dependants.insert(sampler.get()); + } + return dependants; + } + }; } diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index 0629f82f1c..dcdcfb495e 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -43,12 +43,14 @@ class ICPUGraphicsPipeline final : public ICPUPipeline computeDependants() const override + inline core::unordered_set computeDependants() const override { - core::unordered_set dependants = { m_layout.get(), m_renderpass.get()}; - for (const auto& info : m_specInfos) - if (info.shader) dependants.insert(info.shader.get()); - return dependants; + return computeDependantsImpl(this); + } + + inline core::unordered_set computeDependants() override + { + return computeDependantsImpl(this); } inline SCachedCreationParams& getCachedCreationParams() @@ -69,6 +71,7 @@ class 
ICPUGraphicsPipeline final : public ICPUPipelinevalid())return false; // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-dynamicRendering-06576 if (!m_renderpass || m_params.subpassIx >= m_renderpass->getSubpassCount()) return false; @@ -108,6 +111,16 @@ class ICPUGraphicsPipeline final : public ICPUPipeline(hlsl::ShaderStage::ESS_VERTEX + index); } + + template + requires(std::same_as, ICPUGraphicsPipeline>) + static auto computeDependantsImpl(Self* self) { + using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; + core::unordered_set dependants = { self->m_layout.get(), self->m_renderpass.get()}; + for (const auto& info : self->m_specInfos) + if (info.shader) dependants.insert(info.shader.get()); + return dependants; + } }; } diff --git a/include/nbl/asset/ICPUImage.h b/include/nbl/asset/ICPUImage.h index 2527fd1ecb..b732e50492 100644 --- a/include/nbl/asset/ICPUImage.h +++ b/include/nbl/asset/ICPUImage.h @@ -51,6 +51,11 @@ class NBL_API2 ICPUImage final : public IImage, public IPreHashed return {}; } + inline core::unordered_set computeDependants() override + { + return {}; + } + core::blake3_hash_t computeContentHash() const override; // Having regions specififed to upload is optional! So to have content missing we must have regions but no buffer content diff --git a/include/nbl/asset/ICPUImageView.h b/include/nbl/asset/ICPUImageView.h index 6b3d562a60..9639df6eb9 100644 --- a/include/nbl/asset/ICPUImageView.h +++ b/include/nbl/asset/ICPUImageView.h @@ -51,7 +51,12 @@ class ICPUImageView final : public IImageView, public IAsset inline core::unordered_set computeDependants() const override { - return { params.image.get() }; + return computeDependantsImpl(this); + } + + inline core::unordered_set computeDependants() override + { + return computeDependantsImpl(this); } //! 
@@ -70,6 +75,13 @@ class ICPUImageView final : public IImageView, public IAsset protected: virtual ~ICPUImageView() = default; + private: + template + requires(std::same_as, ICPUImageView>) + static auto computeDependantsImpl(Self* self) { + using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; + return core::unordered_set{ self->params.image.get() }; + } }; } diff --git a/include/nbl/asset/ICPUMesh.h b/include/nbl/asset/ICPUMesh.h index 2648900ccc..e9aaf53ba4 100644 --- a/include/nbl/asset/ICPUMesh.h +++ b/include/nbl/asset/ICPUMesh.h @@ -87,6 +87,11 @@ class ICPUMesh final : public IMesh, public IAsset return {}; } + inline core::unordered_set computeDependants() override + { + return {}; + } + protected: private: diff --git a/include/nbl/asset/ICPUMeshBuffer.h b/include/nbl/asset/ICPUMeshBuffer.h index 61e9168a98..c44d055c18 100644 --- a/include/nbl/asset/ICPUMeshBuffer.h +++ b/include/nbl/asset/ICPUMeshBuffer.h @@ -617,6 +617,11 @@ class ICPUMeshBuffer final : public IMeshBuffer computeDependants() override + { + return {}; + } + }; } diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index ae2c64372d..8fe7e38391 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -131,7 +131,7 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase, public ICPUPipe inline std::span getSpecInfo(hlsl::ShaderStage stage) { if (!isMutable()) return {}; - const auto specInfo = static_cast(this)->getSpecInfo(stage); + const auto specInfo = const_cast(this)->getSpecInfo(stage); return { const_cast(specInfo.data()), specInfo.size() }; } diff --git a/include/nbl/asset/ICPUPipelineCache.h b/include/nbl/asset/ICPUPipelineCache.h index 6fc019ce7f..0ff912603d 100644 --- a/include/nbl/asset/ICPUPipelineCache.h +++ b/include/nbl/asset/ICPUPipelineCache.h @@ -65,6 +65,11 @@ class ICPUPipelineCache final : public IPreHashed return {}; } + inline core::unordered_set computeDependants() override + { + return 
{}; + } + // inline core::blake3_hash_t computeContentHash() const override { diff --git a/include/nbl/asset/ICPUPipelineLayout.h b/include/nbl/asset/ICPUPipelineLayout.h index 994d480b17..e755a22f07 100644 --- a/include/nbl/asset/ICPUPipelineLayout.h +++ b/include/nbl/asset/ICPUPipelineLayout.h @@ -79,6 +79,19 @@ class ICPUPipelineLayout : public IAsset, public IPipelineLayout + requires(std::same_as, ICPUPipelineLayout>) + static auto computeDependantsImpl(Self* self) { + using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; + core::unordered_set dependants; + for (auto i = 0; i < self->m_descSetLayouts.size(); i++) + { + if (!self->m_descSetLayouts[i]) continue; + dependants.insert(self->m_descSetLayouts[i].get()); + } + return dependants; + } + }; } diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h index 5d975fa4dc..2b04a2f41b 100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -57,14 +57,11 @@ class ICPURayTracingPipeline final : public ICPUPipeline computeDependants() const override final { - core::unordered_set dependants; - dependants.insert(m_raygen.shader.get()); - for (const auto& missInfo : m_misses) dependants.insert(missInfo.shader.get()); - for (const auto& anyHitInfo : m_hitGroups.anyHits) dependants.insert(anyHitInfo.shader.get()); - for (const auto& closestHitInfo : m_hitGroups.closestHits) dependants.insert(closestHitInfo.shader.get()); - for (const auto& intersectionInfo : m_hitGroups.intersections) dependants.insert(intersectionInfo.shader.get()); - for (const auto& callableInfo : m_callables) dependants.insert(callableInfo.shader.get()); - return dependants; + return computeDependantsImpl(this); + } + + virtual core::unordered_set computeDependants() override final { + return computeDependantsImpl(this); } inline virtual std::span getSpecInfo(hlsl::ShaderStage stage) const override final @@ -108,6 +105,19 @@ class
ICPURayTracingPipeline final : public ICPUPipeline + requires(std::same_as, ICPURayTracingPipeline>) + static auto computeDependantsImpl(Self* self) { + using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; + core::unordered_set dependants; + dependants.insert(self->m_raygen.shader.get()); + for (const auto& missInfo : self->m_misses) dependants.insert(missInfo.shader.get()); + for (const auto& anyHitInfo : self->m_hitGroups.anyHits) dependants.insert(anyHitInfo.shader.get()); + for (const auto& closestHitInfo : self->m_hitGroups.closestHits) dependants.insert(closestHitInfo.shader.get()); + for (const auto& intersectionInfo : self->m_hitGroups.intersections) dependants.insert(intersectionInfo.shader.get()); + for (const auto& callableInfo : self->m_callables) dependants.insert(callableInfo.shader.get()); + return dependants; + } }; } diff --git a/include/nbl/asset/ICPURenderpass.h b/include/nbl/asset/ICPURenderpass.h index bbb2e5003f..9cc73af881 100644 --- a/include/nbl/asset/ICPURenderpass.h +++ b/include/nbl/asset/ICPURenderpass.h @@ -43,6 +43,11 @@ class ICPURenderpass : public IRenderpass, public IAsset return {}; } + inline core::unordered_set computeDependants() override + { + return {}; + } + protected: inline ICPURenderpass(const SCreationParams& _params, const SCreationParamValidationResult& _validation) : IRenderpass(_params, _validation) {} inline ~ICPURenderpass() = default; diff --git a/include/nbl/asset/ICPURenderpassIndependentPipeline.h b/include/nbl/asset/ICPURenderpassIndependentPipeline.h index 8638a4965b..628785d2ab 100644 --- a/include/nbl/asset/ICPURenderpassIndependentPipeline.h +++ b/include/nbl/asset/ICPURenderpassIndependentPipeline.h @@ -71,6 +71,11 @@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, return {}; } + inline core::unordered_set computeDependants() override + { + return {}; + } + // inline const SCachedCreationParams& getCachedCreationParams() const {return 
IRenderpassIndependentPipeline::getCachedCreationParams();} inline SCachedCreationParams& getCachedCreationParams() diff --git a/include/nbl/asset/ICPUSampler.h b/include/nbl/asset/ICPUSampler.h index 46cac56ee0..ed11e7695d 100644 --- a/include/nbl/asset/ICPUSampler.h +++ b/include/nbl/asset/ICPUSampler.h @@ -73,6 +73,11 @@ class ICPUSampler : public ISampler, public IAsset { return {}; } + + inline core::unordered_set computeDependants() override + { + return {}; + } }; } diff --git a/include/nbl/asset/ICPUSkeleton.h b/include/nbl/asset/ICPUSkeleton.h index ce03a9be54..51be7acc5a 100644 --- a/include/nbl/asset/ICPUSkeleton.h +++ b/include/nbl/asset/ICPUSkeleton.h @@ -81,9 +81,23 @@ class ICPUSkeleton final : public ISkeleton, public IAsset inline core::unordered_set computeDependants() const override { - return { m_defaultTransforms.buffer.get(), m_parentJointIDs.buffer.get() }; + return computeDependantsImpl(this); } + inline core::unordered_set computeDependants() override + { + return computeDependantsImpl(this); + } + + private: + template + requires(std::same_as, ICPUSkeleton>) + static auto computeDependantsImpl(Self* self) { + using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; + core::unordered_set dependants; + dependants.insert(self->m_defaultTransforms.buffer.get()); dependants.insert(self->m_parentJointIDs.buffer.get()); + return dependants; + } }; } diff --git a/include/nbl/asset/IShader.h b/include/nbl/asset/IShader.h index 5abd7d1980..59286e219d 100644 --- a/include/nbl/asset/IShader.h +++ b/include/nbl/asset/IShader.h @@ -52,7 +52,12 @@ class IShader : public IAsset inline core::unordered_set computeDependants() const override { - return { m_code.get() }; + return computeDependantsImpl(this); + } + + inline core::unordered_set computeDependants() override + { + return computeDependantsImpl(this); } // @@ -101,6 +106,14 @@ class IShader : public IAsset std::string m_filepathHint; core::smart_refctd_ptr m_code; E_CONTENT_TYPE m_contentType; + + private: + template
+ requires(std::same_as, IShader>) + static auto computeDependantsImpl(Self* self) { + using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; + return core::unordered_set{self->m_code.get()}; + } }; } diff --git a/src/nbl/asset/ICPUDescriptorSet.cpp b/src/nbl/asset/ICPUDescriptorSet.cpp index a298fea491..a95074fdb7 100644 --- a/src/nbl/asset/ICPUDescriptorSet.cpp +++ b/src/nbl/asset/ICPUDescriptorSet.cpp @@ -108,35 +108,47 @@ core::smart_refctd_ptr ICPUDescriptorSet::clone(uint32_t _depth) const return cp; } -core::unordered_set ICPUDescriptorSet::computeDependants() const -{ - core::unordered_set dependants = { m_layout.get() }; - for (auto i = 0u; i < static_cast(IDescriptor::E_TYPE::ET_COUNT); i++) - { - if (!m_descriptorInfos[i]) continue; - const auto size = m_descriptorInfos[i]->size(); - for (auto desc_i = 0u; desc_i < size; desc_i++) +template + requires(std::same_as, ICPUDescriptorSet>) +static auto computeDependantsImpl(Self* self) { + using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; + core::unordered_set dependants = { self->m_layout.get() }; + for (auto i = 0u; i < static_cast(IDescriptor::E_TYPE::ET_COUNT); i++) { - auto* desc = m_descriptorInfos[i]->operator[](desc_i).desc.get(); - if (!desc) continue; - switch (IDescriptor::GetTypeCategory(static_cast(i))) + if (!self->m_descriptorInfos[i]) continue; + const auto size = self->m_descriptorInfos[i]->size(); + for (auto desc_i = 0u; desc_i < size; desc_i++) { - case IDescriptor::EC_BUFFER: - dependants.insert(static_cast(desc)); - case IDescriptor::EC_SAMPLER: - dependants.insert(static_cast(desc)); - case IDescriptor::EC_IMAGE: - dependants.insert(static_cast(desc)); - case IDescriptor::EC_BUFFER_VIEW: - dependants.insert(static_cast(desc)); - case IDescriptor::EC_ACCELERATION_STRUCTURE: - dependants.insert(static_cast(desc)); - default: - break; + auto* desc = self->m_descriptorInfos[i]->operator[](desc_i).desc.get(); + if (!desc) continue; + switch 
(IDescriptor::GetTypeCategory(static_cast(i))) + { + case IDescriptor::EC_BUFFER: + dependants.insert(static_cast(desc)); + case IDescriptor::EC_SAMPLER: + dependants.insert(static_cast(desc)); + case IDescriptor::EC_IMAGE: + dependants.insert(static_cast(desc)); + case IDescriptor::EC_BUFFER_VIEW: + dependants.insert(static_cast(desc)); + case IDescriptor::EC_ACCELERATION_STRUCTURE: + dependants.insert(static_cast(desc)); + default: + break; + } } } - } - return dependants; + return dependants; +} + +core::unordered_set ICPUDescriptorSet::computeDependants() const +{ + return computeDependantsImpl(this); +} + +core::unordered_set ICPUDescriptorSet::computeDependants() +{ + return computeDependantsImpl(this); } } \ No newline at end of file From 2ac65f64277bb9bf3c9288104e36f32e639421f2 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 16 May 2025 18:26:59 +0700 Subject: [PATCH 133/346] Refactor anyDependantDiscardedContents and discardDependantsContents --- include/nbl/asset/IPreHashed.h | 56 +++++++++++++--------------------- 1 file changed, 22 insertions(+), 34 deletions(-) diff --git a/include/nbl/asset/IPreHashed.h b/include/nbl/asset/IPreHashed.h index 4ffda209df..86e1841f61 100644 --- a/include/nbl/asset/IPreHashed.h +++ b/include/nbl/asset/IPreHashed.h @@ -41,36 +41,31 @@ class IPreHashed : public IAsset static inline void discardDependantsContents(const std::span roots) { - struct stack_entry_t - { - const IAsset* asset; - core::unordered_set unvisitedChilds; - }; - core::stack stack; - core::unordered_set alreadyVisited; - auto push = [&stack,&alreadyVisited](const IAsset* node) -> void + core::stack stack; + core::unordered_set alreadyVisited; // whether we have push the node to the stack + core::unordered_set alreadyDescended; // whether we have push the children to the stack + auto push = [&stack,&alreadyVisited](IAsset* node) -> void { if (!node) return; const auto [dummy,inserted] = alreadyVisited.insert(node); if (inserted) - stack.push({ .asset = 
node, .unvisitedChilds = node->computeDependants()}); + stack.push(node); }; for (const auto& root : roots) push(root); while (!stack.empty()) { - auto& entry = stack.top(); - if (entry.unvisitedChilds.size() > 0) + auto* entry = stack.top(); + const auto [dummy, inserted] = alreadyDescended.insert(entry); + if (inserted) { - auto dep = *entry.unvisitedChilds.begin(); - entry.unvisitedChilds.erase(entry.unvisitedChilds.begin()); - push(dep); - } - else + core::unordered_set dependants = entry->computeDependants(); + for (auto* dependant : dependants) push(dependant); + } else { // post order traversal does discard - auto* isPrehashed = dynamic_cast(entry.asset); + auto* isPrehashed = dynamic_cast(entry); if (isPrehashed) isPrehashed->discardContent(); stack.pop(); @@ -79,13 +74,9 @@ class IPreHashed : public IAsset } static inline bool anyDependantDiscardedContents(const IAsset* root) { - struct stack_entry_t - { - const IAsset* asset; - core::unordered_set unvisitedChilds; - }; - core::stack stack; - core::unordered_set alreadyVisited; + core::stack stack; + core::unordered_set alreadyVisited; // whether we have push the node to the stack + core::unordered_set alreadyDescended; // whether we have push the children to the stack auto push = [&stack,&alreadyVisited](const IAsset* node) -> bool { if (!node) @@ -96,7 +87,7 @@ class IPreHashed : public IAsset auto* isPrehashed = dynamic_cast(node); if (isPrehashed && isPrehashed->missingContent()) return true; - stack.push({ .asset = node, .unvisitedChilds = node->computeDependants() }); + stack.push(node); } return false; }; @@ -104,16 +95,13 @@ class IPreHashed : public IAsset return true; while (!stack.empty()) { - auto& entry = stack.top(); - auto& unvisitedChilds = entry.unvisitedChilds; - if (unvisitedChilds.size() > 0) + auto* entry = stack.top(); + const auto [dummy, inserted] = alreadyDescended.insert(entry); + if (inserted) { - auto dep = *unvisitedChilds.begin(); - 
unvisitedChilds.erase(unvisitedChilds.begin()); - if (push(dep)) - return true; - } - else + core::unordered_set dependants = entry->computeDependants(); + for (auto* dependant : dependants) push(dependant); + } else stack.pop(); } return false; From 209ecf3478357de8f9b2d3d892971d9464305d34 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 19 May 2025 14:50:39 +0200 Subject: [PATCH 134/346] correct policy setup, propagate to all 3rdparty projects; silents some warnings and fixes an issue with bz2 error only on first configure run --- 3rdparty/CMakeLists.txt | 3 +++ CMakeLists.txt | 22 +++++++++++++++------- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt index ffbf8e4cbd..5bd2d6859f 100755 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -4,6 +4,9 @@ include(../cmake/common.cmake) +project(Nabla-3rdparty LANGUAGES CXX C) +enable_language(C CXX ASM ASM_NASM) + option(NBL_FORCE_RELEASE_3RDPARTY "Force map 3rdaprty's configuration regardless Nabla configuration to Release" OFF) option(NBL_FORCE_RELWITHDEBINFO_3RDPARTY "Force map 3rdaprty's configuration regardless Nabla configuration to RelWithDebInfo" OFF) diff --git a/CMakeLists.txt b/CMakeLists.txt index a63d30a89d..f24877148b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,16 +1,24 @@ # Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. # This file is part of the "Nabla Engine". 
# For conditions of distribution and use, see copyright notice in nabla.h.in or nabla.h - cmake_minimum_required(VERSION 3.31) -# TODO: Yas - once we deploy 4.x we will fire `cmake_policy` instead of manually picking policies + +# TODO: Yas - once we deploy 4.x we will fire `cmake_policy(VERSION [...])` instead of manually picking policies # https://cmake.org/cmake/help/latest/command/cmake_minimum_required.html#policy-version # also we should update deps which throw warnings about < 3.10 compatibility -cmake_policy(SET CMP0003 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0077.html#cmp0077 -cmake_policy(SET CMP0077 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0077.html#cmp0077 -cmake_policy(SET CMP0112 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0112.html#cmp0112 -cmake_policy(SET CMP0141 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0141.html#policy:CMP0141 -cmake_policy(SET CMP0118 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0118.html#policy:CMP0118 + +macro(NBL_POLICY P S) +if(POLICY ${P}) + cmake_policy(SET ${P} ${S}) + set(CMAKE_POLICY_DEFAULT_${P} ${S}) +endif() +endmacro() + +NBL_POLICY(CMP0003 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0003.html#cmp0003 +NBL_POLICY(CMP0077 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0077.html#cmp0077 +NBL_POLICY(CMP0112 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0112.html#cmp0112 +NBL_POLICY(CMP0141 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0141.html#policy:CMP0141 +NBL_POLICY(CMP0118 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0118.html#policy:CMP0118 set(NBL_BUILD_ANDROID OFF) From ae27b7df8f593880432360f4796386800b2f0c59 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 19 May 2025 14:51:47 +0200 Subject: [PATCH 135/346] enforce some constraints, stop the double instantiation of `Triangles` and `AABBs` with `const BufferType` and `BufferType` --- include/nbl/asset/IAccelerationStructure.h | 30 ++++++--- 
include/nbl/video/IGPUAccelerationStructure.h | 20 +++--- include/nbl/video/ILogicalDevice.h | 66 ++++++++++++------- src/nbl/video/CVulkanAccelerationStructure.h | 10 +-- src/nbl/video/CVulkanLogicalDevice.h | 42 ++++++------ src/nbl/video/IGPUAccelerationStructure.cpp | 4 +- src/nbl/video/utilities/CAssetConverter.cpp | 41 ++---------- 7 files changed, 104 insertions(+), 109 deletions(-) diff --git a/include/nbl/asset/IAccelerationStructure.h b/include/nbl/asset/IAccelerationStructure.h index 0efe6781ae..a29d27b828 100644 --- a/include/nbl/asset/IAccelerationStructure.h +++ b/include/nbl/asset/IAccelerationStructure.h @@ -88,19 +88,32 @@ class IBottomLevelAccelerationStructure : public IAccelerationStructure NO_DUPLICATE_ANY_HIT_INVOCATION_BIT = 0x1u<<1u, }; + enum class GeometryType : uint8_t + { + Triangles = 0, + AABBs = 1, + // Later: LSS and friends + Count = 2 + }; + // Note that in Vulkan strides are 64-bit value but restricted to be 32-bit in range - template requires std::is_base_of_v + template requires (!std::is_const_v && std::is_base_of_v) struct Triangles { public: - using buffer_t = std::remove_const_t; - constexpr static inline bool Host = std::is_same_v; + using buffer_t = BufferType; + constexpr static inline GeometryType Type = GeometryType::Triangles; + + private: + constexpr static inline bool HostTransform = std::is_same_v; + + public: // we make our life easier by not taking pointers to single matrix values - using transform_t = std::conditional_t>; + using transform_t = std::conditional_t>; inline bool hasTransform() const { - if constexpr (Host) + if constexpr (HostTransform) return !core::isnan(transform[0][0]); else return bool(transform.buffer); @@ -122,17 +135,18 @@ class IBottomLevelAccelerationStructure : public IAccelerationStructure private: constexpr static transform_t __transform_initializer() { - if constexpr (Host) + if constexpr (HostTransform) return hlsl::float32_t3x4(std::numeric_limits::quiet_NaN()); return {}; } }; // - 
template requires std::is_base_of_v + template requires (!std::is_const_v && std::is_base_of_v) struct AABBs { - using buffer_t = std::remove_const_t; + using buffer_t = BufferType; + constexpr static inline GeometryType Type = GeometryType::Triangles; // for `MOTION_BIT` you don't get a second buffer for AABBs at different times because linear interpolation of AABBs doesn't work asset::SBufferBinding data = {}; diff --git a/include/nbl/video/IGPUAccelerationStructure.h b/include/nbl/video/IGPUAccelerationStructure.h index af541bdccb..b7c1858130 100644 --- a/include/nbl/video/IGPUAccelerationStructure.h +++ b/include/nbl/video/IGPUAccelerationStructure.h @@ -45,7 +45,7 @@ class IGPUAccelerationStructure : public IBackendObject #endif //! builds - template + template requires (!std::is_const_v && std::is_base_of_v) struct BuildInfo { public: @@ -112,7 +112,7 @@ class IGPUAccelerationStructure : public IBackendObject IGPUAccelerationStructure* dst = nullptr; COPY_MODE mode = COPY_MODE::CLONE; }; - template + template requires (!std::is_const_v && std::is_base_of_v) struct CopyToMemoryInfo { const IGPUAccelerationStructure* src = nullptr; @@ -121,7 +121,7 @@ class IGPUAccelerationStructure : public IBackendObject }; using DeviceCopyToMemoryInfo = CopyToMemoryInfo; using HostCopyToMemoryInfo = CopyToMemoryInfo; - template + template requires (!std::is_const_v && std::is_base_of_v) struct CopyFromMemoryInfo { asset::SBufferBinding src = nullptr; @@ -181,7 +181,7 @@ class IGPUBottomLevelAccelerationStructure : public asset::IBottomLevelAccelerat using DirectBuildRangeRangeInfos = const BuildRangeInfo* const*; using MaxInputCounts = const uint32_t* const; - template + template requires (!std::is_const_v && std::is_base_of_v) struct BuildInfo final : IGPUAccelerationStructure::BuildInfo { private: @@ -203,7 +203,7 @@ class IGPUBottomLevelAccelerationStructure : public asset::IBottomLevelAccelerat NBL_API2 uint32_t valid(const T* const buildRangeInfosOrMaxPrimitiveCounts) 
const; // really expensive to call, `valid` only calls it when `_NBL_DEBUG` is defined - inline bool validGeometry(size_t& totalPrims, const AABBs& geometry, const BuildRangeInfo& buildRangeInfo) const + inline bool validGeometry(size_t& totalPrims, const AABBs& geometry, const BuildRangeInfo& buildRangeInfo) const { constexpr size_t AABBalignment = 8ull; // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkAccelerationStructureBuildRangeInfoKHR-primitiveOffset-03659 @@ -222,7 +222,7 @@ class IGPUBottomLevelAccelerationStructure : public asset::IBottomLevelAccelerat totalPrims += buildRangeInfo.primitiveCount; return true; } - inline bool validGeometry(size_t& totalPrims, const Triangles& geometry, const BuildRangeInfo& buildRangeInfo) const + inline bool validGeometry(size_t& totalPrims, const Triangles& geometry, const BuildRangeInfo& buildRangeInfo) const { // if (!dstAS->validVertexFormat(geometry.vertexFormat)) @@ -306,7 +306,7 @@ class IGPUBottomLevelAccelerationStructure : public asset::IBottomLevelAccelerat *(oit++) = core::smart_refctd_ptr(srcAS); *(oit++) = core::smart_refctd_ptr(dstAS); - if (buildFlags.hasFlags(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) + if (buildFlags.hasFlags(asset::IBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) { for (auto i=0u; i* triangles = nullptr; - const AABBs* aabbs; + const Triangles* triangles = nullptr; + const AABBs* aabbs; }; }; using DeviceBuildInfo = BuildInfo; @@ -393,7 +393,7 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr using DirectBuildRangeRangeInfos = const BuildRangeInfo*; using MaxInputCounts = const uint32_t; - template + template requires (!std::is_const_v && std::is_base_of_v) struct BuildInfo final : IGPUAccelerationStructure::BuildInfo { private: diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 8ad3b839ab..b23afa2679 100644 --- 
a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -412,19 +412,20 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe }; // fun fact: you can use garbage/invalid pointers/offset for the Device/Host addresses of the per-geometry data, just make sure what was supposed to be null is null template requires nbl::is_any_of_v, - IGPUBottomLevelAccelerationStructure::Triangles, - IGPUBottomLevelAccelerationStructure::AABBs, - IGPUBottomLevelAccelerationStructure::AABBs + asset::IBottomLevelAccelerationStructure::Triangles, + asset::IBottomLevelAccelerationStructure::Triangles, + asset::IBottomLevelAccelerationStructure::AABBs, + asset::IBottomLevelAccelerationStructure::AABBs > inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes( - const core::bitflag flags, + const bool hostBuild, + const core::bitflag flags, const bool motionBlur, const std::span geometries, const uint32_t* const pMaxPrimitiveCounts ) const { - if (invalidFeaturesForASBuild(motionBlur)) + if (invalidFeaturesForASBuild(hostBuild,motionBlur)) { NBL_LOG_ERROR("Required features are not enabled"); return {}; @@ -455,13 +456,29 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe uint32_t primsFree = limits.maxAccelerationStructurePrimitiveCount; for (auto i=0u; i,Geometry>) + const auto& geom = geometries[i]; + if constexpr (Geometry::Type==asset::IBottomLevelAccelerationStructure::GeometryType::Triangles) { - // TODO: do we check `maxVertex`, `vertexStride` and `indexType` for validity? 
+ if (flags.hasFlags(asset::IBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) + { + NBL_LOG_ERROR("Primitive type is Triangles but build flag says BLAS build is AABBs"); + return {}; + } + if (!getPhysicalDevice()->getBufferFormatUsages()[geom.vertexFormat].accelerationStructureVertex) + { + NBL_LOG_ERROR("Vertex Format %d not supported as Acceleration Structure Vertex Position Input on this Device",geom.vertexFormat); + return {}; + } + // TODO: do we check `maxVertex`, `vertexStride` and `indexType` for validity } - if constexpr (std::is_same_v,Geometry>) + if constexpr (Geometry::Type==asset::IBottomLevelAccelerationStructure::GeometryType::AABBs) { - // TODO: check stride and geometry flags for validity? + if (!flags.hasFlags(asset::IBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) + { + NBL_LOG_ERROR("Primitive type is AABB but build flag says BLAS build is not AABBs"); + return {}; + } + // TODO: check stride and geometry flags for validity } if (pMaxPrimitiveCounts[i] > primsFree) { @@ -471,16 +488,16 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe primsFree -= pMaxPrimitiveCounts[i]; } - return getAccelerationStructureBuildSizes_impl(flags,motionBlur,geometries,pMaxPrimitiveCounts); + return getAccelerationStructureBuildSizes_impl(hostBuild,flags,motionBlur,geometries,pMaxPrimitiveCounts); } inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes( const bool hostBuild, - const core::bitflag flags, + const core::bitflag flags, const bool motionBlur, const uint32_t maxInstanceCount ) const { - if (invalidFeaturesForASBuild(motionBlur)) + if (invalidFeaturesForASBuild(hostBuild,motionBlur)) { NBL_LOG_ERROR("Required features are not enabled"); return {}; @@ -504,7 +521,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe } // little utility template - inline AccelerationStructureBuildSizes 
getAccelerationStructureBuildSizes(const core::bitflag flags, const bool motionBlur, const uint32_t maxInstanceCount) const + inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes(const core::bitflag flags, const bool motionBlur, const uint32_t maxInstanceCount) const { return getAccelerationStructureBuildSizes(std::is_same_v,asset::ICPUBuffer>,flags,motionBlur,maxInstanceCount); } @@ -1070,20 +1087,20 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe virtual core::smart_refctd_ptr createTopLevelAccelerationStructure_impl(IGPUTopLevelAccelerationStructure::SCreationParams&& params) = 0; virtual AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const = 0; virtual AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const = 0; virtual AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const = 0; virtual AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag 
flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const = 0; virtual AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( const bool hostBuild, const core::bitflag flags, @@ -1333,8 +1350,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe } return false; } - template - bool invalidFeaturesForASBuild(const bool motionBlur) const + bool invalidFeaturesForASBuild(const bool hostBuild, const bool motionBlur) const { // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkGetAccelerationStructureBuildSizesKHR-accelerationStructure-08933 if (!m_enabledFeatures.accelerationStructure) @@ -1343,7 +1359,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe return true; } // not sure of VUID - if (std::is_same_v && !m_enabledFeatures.accelerationStructureHostCommands) + if (hostBuild && !m_enabledFeatures.accelerationStructureHostCommands) { NBL_LOG_ERROR("Feature `acceleration structure` host commands is not enabled"); return true; diff --git a/src/nbl/video/CVulkanAccelerationStructure.h b/src/nbl/video/CVulkanAccelerationStructure.h index b6c06f158d..42fefaa6d1 100644 --- a/src/nbl/video/CVulkanAccelerationStructure.h +++ b/src/nbl/video/CVulkanAccelerationStructure.h @@ -118,7 +118,7 @@ inline VkGeometryFlagsKHR getVkGeometryFlagsFrom(const IGPUBottomLevelAccelerati // The srcAccelerationStructure, dstAccelerationStructure, and mode members of pBuildInfo are ignored. 
Any VkDeviceOrHostAddressKHR members of pBuildInfo are ignored by this command static const VkDeviceOrHostAddressConstKHR NullAddress = { 0x0ull }; template -void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles& triangles, VkAccelerationStructureGeometryKHR& outBase) +void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles& triangles, VkAccelerationStructureGeometryKHR& outBase) { static const VkDeviceOrHostAddressConstKHR DummyNonNullAddress = { 0xdeadbeefBADC0FFEull }; @@ -129,7 +129,7 @@ void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles(triangles.indexType); - outBase.geometry.triangles.indexData = QueryOnly ? NullAddress:getVkDeviceOrHostAddress(triangles.indexData); + outBase.geometry.triangles.indexData = triangles.indexType==asset::E_INDEX_TYPE::EIT_UNKNOWN || QueryOnly ? NullAddress:getVkDeviceOrHostAddress(triangles.indexData); // except that the hostAddress member of VkAccelerationStructureGeometryTrianglesDataKHR::transformData will be examined to check if it is NULL. 
if (!triangles.hasTransform()) outBase.geometry.triangles.transformData = NullAddress; @@ -145,7 +145,7 @@ void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles -void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles& triangles, VkAccelerationStructureGeometryKHR& outBase, VkAccelerationStructureGeometryMotionTrianglesDataNV* &p_vertexMotion) +void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles& triangles, VkAccelerationStructureGeometryKHR& outBase, VkAccelerationStructureGeometryMotionTrianglesDataNV* &p_vertexMotion) { getVkASGeometryFrom(triangles,outBase); if (triangles.vertexData[1].buffer) @@ -158,7 +158,7 @@ void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles -void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::AABBs& aabbs, VkAccelerationStructureGeometryKHR& outBase) +void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::AABBs& aabbs, VkAccelerationStructureGeometryKHR& outBase) { outBase = {VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_KHR,nullptr,VK_GEOMETRY_TYPE_AABBS_KHR}; outBase.geometry.aabbs = {VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_AABBS_DATA_KHR,nullptr}; @@ -221,7 +221,7 @@ inline VkAccelerationStructureBuildGeometryInfoKHR getVkASBuildGeometryInfo(cons for (auto j=0u; j flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const override { - return getAccelerationStructureBuildSizes_impl_impl_impl(flags,motionBlur,geometries,pMaxPrimitiveCounts); + return getAccelerationStructureBuildSizes_impl_impl_impl(hostBuild,flags,motionBlur,geometries,pMaxPrimitiveCounts); } inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const 
std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const override { - return getAccelerationStructureBuildSizes_impl_impl_impl(flags,motionBlur,geometries,pMaxPrimitiveCounts); + return getAccelerationStructureBuildSizes_impl_impl_impl(hostBuild,flags,motionBlur,geometries,pMaxPrimitiveCounts); } inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const override { - return getAccelerationStructureBuildSizes_impl_impl_impl(flags,motionBlur,geometries,pMaxPrimitiveCounts); + return getAccelerationStructureBuildSizes_impl_impl_impl(hostBuild,flags,motionBlur,geometries,pMaxPrimitiveCounts); } inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const override { - return getAccelerationStructureBuildSizes_impl_impl_impl(flags,motionBlur,geometries,pMaxPrimitiveCounts); + return getAccelerationStructureBuildSizes_impl_impl_impl(hostBuild,flags,motionBlur,geometries,pMaxPrimitiveCounts); } template inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl_impl_impl( - const core::bitflag flags, const bool motionBlur, + const bool hostBuild, const core::bitflag flags, const bool motionBlur, const std::span geometries, const uint32_t* const pMaxPrimitiveCounts ) const { - constexpr bool IsAABB = 
std::is_same_v>; + constexpr bool IsTriangle = Geometry::Type==asset::IBottomLevelAccelerationStructure::GeometryType::Triangles; core::vector vk_geometries(geometries.size()); - core::vector vk_triangleMotions(IsAABB ? 0u:geometries.size()); + core::vector vk_triangleMotions(IsTriangle ? geometries.size():0u); auto outTriangleMotions = vk_triangleMotions.data(); for (auto i=0u; i(geometries[i],vk_geometries[i]); - else + if constexpr (IsTriangle) getVkASGeometryFrom(geometries[i],vk_geometries[i],outTriangleMotions); + else + getVkASGeometryFrom(geometries[i],vk_geometries[i]); } - return getAccelerationStructureBuildSizes_impl_impl( - std::is_same_v,false, - getVkASBuildFlagsFrom(flags,motionBlur), - vk_geometries,pMaxPrimitiveCounts - ); + return getAccelerationStructureBuildSizes_impl_impl(hostBuild,false,getVkASBuildFlagsFrom(flags,motionBlur),vk_geometries,pMaxPrimitiveCounts); } AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( diff --git a/src/nbl/video/IGPUAccelerationStructure.cpp b/src/nbl/video/IGPUAccelerationStructure.cpp index b975742436..e994123616 100644 --- a/src/nbl/video/IGPUAccelerationStructure.cpp +++ b/src/nbl/video/IGPUAccelerationStructure.cpp @@ -5,7 +5,7 @@ namespace nbl::video { -template +template requires (!std::is_const_v && std::is_base_of_v) bool IGPUAccelerationStructure::BuildInfo::invalid(const IGPUAccelerationStructure* const src, const IGPUAccelerationStructure* const dst) const { // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkBuildAccelerationStructuresIndirectKHR-dstAccelerationStructure-03800 @@ -61,7 +61,7 @@ bool IGPUAccelerationStructure::BuildInfo::invalid(const IGPUAcceler //extern template class IGPUAccelerationStructure::BuildInfo; -template +template requires (!std::is_const_v && std::is_base_of_v) template// requires nbl::is_any_of_v,uint32_t,IGPUBottomLevelAccelerationStructure::BuildRangeInfo>,IGPUBottomLevelAccelerationStructure::BuildRangeInfo> 
uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::valid(const T* const buildRangeInfosOrMaxPrimitiveCounts) const { diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 285a1dce1d..5d16c5bb9b 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2853,47 +2853,16 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult else { const uint32_t* pPrimitiveCounts = as->getGeometryPrimitiveCounts().data(); - // the code here is not pretty, but DRY-ing is of this is for later -// TODO: ILogicalDevice needs code to query build sizes of ICPUBottomLevelAccelerationStructure geometries! if (buildFlags.hasFlags(ICPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) { - const auto geoms = as->getAABBGeometries(); - if (patch.hostBuild) - { - const std::span> cpuGeoms = { - reinterpret_cast*>(geoms.data()),geoms.size() - }; - sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pPrimitiveCounts); - } - else - { - const std::span> cpuGeoms = { - reinterpret_cast*>(geoms.data()),geoms.size() - }; - sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pPrimitiveCounts); - } - // TODO: check if the strides need to be aligned to 4 bytes for AABBs + sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,as->getAABBGeometries(),pPrimitiveCounts); for (const auto& geom : geoms) if (const auto aabbCount=*(pPrimitiveCounts++); aabbCount) incrementBuildSize(aabbCount*geom.stride,alignof(float)); } else { - const auto geoms = as->getTriangleGeometries(); - if (patch.hostBuild) - { - const std::span> cpuGeoms = { - reinterpret_cast*>(geoms.data()),geoms.size() - }; - sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pPrimitiveCounts); - } - else - { - const std::span> cpuGeoms = { - 
reinterpret_cast*>(geoms.data()),geoms.size() - }; - sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pPrimitiveCounts); - } + sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,as->getTriangleGeometries(),pPrimitiveCounts); for (const auto& geom : geoms) if (const auto triCount=*(pPrimitiveCounts++); triCount) { @@ -4683,8 +4652,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul allocSizes.reserve(asCount); // BLAS and TLAS specific things core::vector geometryRangeInfo; - core::vector> triangles; - core::vector> aabbs; + core::vector> triangles; + core::vector> aabbs; core::vector> trackedBLASes; if constexpr (IsTLAS) trackedBLASes.reserve(asCount); @@ -5034,7 +5003,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul .geometryFlags = geom.geometryFlags }); } - buildInfo.aabbs = reinterpret_cast* const&>(aabbsOffset); + buildInfo.aabbs = reinterpret_cast* const&>(aabbsOffset); } else { From 292f792e65e066a1189c7991e1021e29ab9656f9 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 19 May 2025 15:32:02 +0200 Subject: [PATCH 136/346] update DXC pointer (to Clang fixes merge commit) --- 3rdparty/dxc/dxc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc index 4621c707ed..71f2766da9 160000 --- a/3rdparty/dxc/dxc +++ b/3rdparty/dxc/dxc @@ -1 +1 @@ -Subproject commit 4621c707ed774ab8382391f6434810ebecd37111 +Subproject commit 71f2766da918d33d34fefac270fdee983a06dd20 From b3b2b0301fc36ef8df8ee01df6de1a0080713b05 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 19 May 2025 15:51:49 +0200 Subject: [PATCH 137/346] post-merge updates, correct IBottomLevelAccelerationStructure::BUILD_FLAGS's initial casts --- include/nbl/asset/IAccelerationStructure.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/nbl/asset/IAccelerationStructure.h 
b/include/nbl/asset/IAccelerationStructure.h index d251dd3077..eac16d8d32 100644 --- a/include/nbl/asset/IAccelerationStructure.h +++ b/include/nbl/asset/IAccelerationStructure.h @@ -59,11 +59,11 @@ class IBottomLevelAccelerationStructure : public IAccelerationStructure // build flags, we don't expose flags that don't make sense for certain levels enum class BUILD_FLAGS : uint16_t { - ALLOW_UPDATE_BIT = base_build_flags_t::ALLOW_UPDATE_BIT, - ALLOW_COMPACTION_BIT = base_build_flags_t::ALLOW_COMPACTION_BIT, - PREFER_FAST_TRACE_BIT = base_build_flags_t::PREFER_FAST_TRACE_BIT, - PREFER_FAST_BUILD_BIT = base_build_flags_t::PREFER_FAST_BUILD_BIT, - LOW_MEMORY_BIT = base_build_flags_t::LOW_MEMORY_BIT, + ALLOW_UPDATE_BIT = static_cast(base_build_flags_t::ALLOW_UPDATE_BIT), + ALLOW_COMPACTION_BIT = static_cast(base_build_flags_t::ALLOW_COMPACTION_BIT), + PREFER_FAST_TRACE_BIT = static_cast(base_build_flags_t::PREFER_FAST_TRACE_BIT), + PREFER_FAST_BUILD_BIT = static_cast(base_build_flags_t::PREFER_FAST_BUILD_BIT), + LOW_MEMORY_BIT = static_cast(base_build_flags_t::LOW_MEMORY_BIT), // Synthetic flag we use to indicate that the build data are AABBs instead of triangles, we've taken away the per-geometry choice thanks to: // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkAccelerationStructureBuildGeometryInfoKHR-type-03792 GEOMETRY_TYPE_IS_AABB_BIT = 0x1u<<5u, From 6dda1e265afd4fe32e128a71edef62c2bf2d729c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Jos=C3=A9=20Letterio?= <40742817+Fletterio@users.noreply.github.com> Date: Mon, 19 May 2025 20:06:50 -0300 Subject: [PATCH 138/346] Add a bunch of missing `const` in demote_promote_writer_readers_lock.h --- .../nbl/system/demote_promote_writer_readers_lock.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/nbl/system/demote_promote_writer_readers_lock.h b/include/nbl/system/demote_promote_writer_readers_lock.h index 6823c26c27..5447e65f3e 100644 --- 
a/include/nbl/system/demote_promote_writer_readers_lock.h +++ b/include/nbl/system/demote_promote_writer_readers_lock.h @@ -271,7 +271,7 @@ class demote_promote_writer_readers_lock_debug struct DefaultPreemptionCheck { - bool operator()(state_lock_value_t oldState) + bool operator()(const state_lock_value_t oldState) { return false; } @@ -361,13 +361,13 @@ class dpwr_lock_guard_base /** * @brief Checks whether this guard is currently locking the lock `lk` */ - bool hasLocked(dpwr_lock_t& lk) const + bool hasLocked(const dpwr_lock_t& lk) const { return m_lock == &lk; } protected: - dpwr_lock_guard_base(dpwr_lock_t& lk) noexcept : m_lock(&lk) {} + dpwr_lock_guard_base(const dpwr_lock_t& lk) noexcept : m_lock(&lk) {} dpwr_lock_t* m_lock; }; @@ -385,7 +385,7 @@ class dpwr_read_lock_guard_debug : public impl::dpwr_lock_guard_base; using dpwr_write_lock_guard_debug_t = dpwr_write_lock_guard_debug; - dpwr_read_lock_guard_debug(dpwr_lock_t& lk, std::adopt_lock_t) : base_t(lk) {} + dpwr_read_lock_guard_debug(const dpwr_lock_t& lk, std::adopt_lock_t) : base_t(lk) {} explicit dpwr_read_lock_guard_debug(dpwr_lock_t& lk) : dpwr_read_lock_guard_debug(lk, std::adopt_lock_t()) { this->m_lock->read_lock(); @@ -406,7 +406,7 @@ class dpwr_write_lock_guard_debug : public impl::dpwr_lock_guard_base; using dpwr_read_lock_guard_debug_t = dpwr_read_lock_guard_debug; - dpwr_write_lock_guard_debug(dpwr_lock_t& lk, std::adopt_lock_t) : base_t(lk) {} + dpwr_write_lock_guard_debug(const dpwr_lock_t& lk, std::adopt_lock_t) : base_t(lk) {} explicit dpwr_write_lock_guard_debug(dpwr_lock_t& lk) : dpwr_write_lock_guard_debug(lk, std::adopt_lock_t()) { this->m_lock->write_lock(); From 004c95adc9a3b1a002200d059738f30aede4c3f1 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 20 May 2025 12:05:48 +0700 Subject: [PATCH 139/346] fixed minor bug --- examples_tests | 2 +- include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git 
a/examples_tests b/examples_tests index e828dc49ef..f4af3edc1c 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit e828dc49ef0a223dcbb8b4af8d722974747f29ee +Subproject commit f4af3edc1cd8d152f6c67bd15577b2595cb2a43f diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 88ff328e05..12f65420ca 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -19,7 +19,7 @@ template struct virtual_wg_size_log2 { static_assert(WorkgroupSizeLog2>=SubgroupSizeLog2, "WorkgroupSize cannot be smaller than SubgroupSize"); - static_assert(WorkgroupSizeLog2<=SubgroupSizeLog2+4, "WorkgroupSize cannot be larger than SubgroupSize*16"); + // static_assert(WorkgroupSizeLog2<=SubgroupSizeLog2+4, "WorkgroupSize cannot be larger than SubgroupSize*16"); NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2),uint16_t,conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2*2+2),uint16_t,3,2>::value,1>::value; NBL_CONSTEXPR_STATIC_INLINE uint16_t value = mpl::max_v+SubgroupSizeLog2; }; From 9bd76f904b05b835f4f8ea42396ac1b5419e26c3 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Tue, 20 May 2025 09:26:19 +0200 Subject: [PATCH 140/346] add docker/msvc-winsdk submodule --- .gitmodules | 5 ++++- docker/msvc-winsdk | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) create mode 160000 docker/msvc-winsdk diff --git a/.gitmodules b/.gitmodules index 584ff16d65..00482441de 100644 --- a/.gitmodules +++ b/.gitmodules @@ -117,4 +117,7 @@ url = git@github.com:Devsh-Graphics-Programming/Compiler-Explorer-Docker.git [submodule "3rdparty/glm"] path = 3rdparty/glm - url = git@github.com:Devsh-Graphics-Programming/glm.git \ No newline at end of file + url = git@github.com:Devsh-Graphics-Programming/glm.git +[submodule "docker/msvc-winsdk"] + path = docker/msvc-winsdk 
+ url = ../docker-nanoserver-msvc-winsdk diff --git a/docker/msvc-winsdk b/docker/msvc-winsdk new file mode 160000 index 0000000000..8aa9e767ec --- /dev/null +++ b/docker/msvc-winsdk @@ -0,0 +1 @@ +Subproject commit 8aa9e767ec60aa77f477ac6cf41728e997dcc950 From 0abbb21ad5f414980480e0c2f4135d631d8cc1c2 Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 20 May 2025 11:19:55 +0200 Subject: [PATCH 141/346] get stuff to compile again --- include/nbl/asset/IAccelerationStructure.h | 7 ++----- src/nbl/video/CVulkanAccelerationStructure.h | 4 ++-- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/include/nbl/asset/IAccelerationStructure.h b/include/nbl/asset/IAccelerationStructure.h index a29d27b828..665135f695 100644 --- a/include/nbl/asset/IAccelerationStructure.h +++ b/include/nbl/asset/IAccelerationStructure.h @@ -103,11 +103,8 @@ class IBottomLevelAccelerationStructure : public IAccelerationStructure public: using buffer_t = BufferType; constexpr static inline GeometryType Type = GeometryType::Triangles; - - private: - constexpr static inline bool HostTransform = std::is_same_v; - public: + constexpr static inline bool HostTransform = std::is_same_v; // we make our life easier by not taking pointers to single matrix values using transform_t = std::conditional_t>; @@ -146,7 +143,7 @@ class IBottomLevelAccelerationStructure : public IAccelerationStructure struct AABBs { using buffer_t = BufferType; - constexpr static inline GeometryType Type = GeometryType::Triangles; + constexpr static inline GeometryType Type = GeometryType::AABBs; // for `MOTION_BIT` you don't get a second buffer for AABBs at different times because linear interpolation of AABBs doesn't work asset::SBufferBinding data = {}; diff --git a/src/nbl/video/CVulkanAccelerationStructure.h b/src/nbl/video/CVulkanAccelerationStructure.h index 42fefaa6d1..8041927fa2 100644 --- a/src/nbl/video/CVulkanAccelerationStructure.h +++ b/src/nbl/video/CVulkanAccelerationStructure.h @@ -137,7 +137,7 @@ void 
getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles(triangles.transform); @@ -147,7 +147,7 @@ void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles& triangles, VkAccelerationStructureGeometryKHR& outBase, VkAccelerationStructureGeometryMotionTrianglesDataNV* &p_vertexMotion) { - getVkASGeometryFrom(triangles,outBase); + getVkASGeometryFrom(triangles,outBase); if (triangles.vertexData[1].buffer) { p_vertexMotion->sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_MOTION_TRIANGLES_DATA_NV; From ccacddbc5b2ca1bed787e38fdf50a459606e5376 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 20 May 2025 16:49:30 +0700 Subject: [PATCH 142/346] store temporaries with data accessor --- examples_tests | 2 +- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 40 +++++++++++-------- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/examples_tests b/examples_tests index f4af3edc1c..44c34a8a65 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit f4af3edc1cd8d152f6c67bd15577b2595cb2a43f +Subproject commit 44c34a8a65866fb6304c12032efd08e2338c7116 diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index d53bfd6000..8bfd8b0194 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -151,20 +151,21 @@ struct scan using params_lv1_t = subgroup2::ArithmeticParams; BinOp binop; - vector_lv0_t scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); subgroup2::inclusive_scan inclusiveScan0; // level 0 scan [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - dataAccessor.template get(idx * Config::WorkgroupSize + 
virtualInvocationIndex, scan_local[idx]); - scan_local[idx] = inclusiveScan0(scan_local[idx]); + vector_lv0_t value; + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + value = inclusiveScan0(value); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); - scratchAccessor.template set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -188,23 +189,26 @@ struct scan [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { + vector_lv0_t value; + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); scalar_t left; scratchAccessor.template get(virtualSubgroupID,left); if (Exclusive) { - scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(scan_local[idx][Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); + scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(value[Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) - scan_local[idx][Config::ItemsPerInvocation_0-i-1] 
= binop(left, hlsl::mix(scan_local[idx][Config::ItemsPerInvocation_0-i-2], left_last_elem, (Config::ItemsPerInvocation_0-i-1==0))); + value[Config::ItemsPerInvocation_0-i-1] = binop(left, hlsl::mix(value[Config::ItemsPerInvocation_0-i-2], left_last_elem, (Config::ItemsPerInvocation_0-i-1==0))); } else { [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) - scan_local[idx][i] = binop(left, scan_local[idx][i]); + value[i] = binop(left, value[i]); } - dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); } } }; @@ -303,20 +307,21 @@ struct scan using params_lv2_t = subgroup2::ArithmeticParams; BinOp binop; - vector_lv0_t scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); subgroup2::inclusive_scan inclusiveScan0; // level 0 scan [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); - scan_local[idx] = inclusiveScan0(scan_local[idx]); + vector_lv0_t value; + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + value = inclusiveScan0(value); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); - scratchAccessor.template set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan 
(reduction) to level 1 scan + scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -368,23 +373,26 @@ struct scan [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { + vector_lv0_t value; + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const scalar_t left; scratchAccessor.template get(virtualSubgroupID, left); if (Exclusive) { - scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(scan_local[idx][Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); + scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(value[Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) - scan_local[idx][Config::ItemsPerInvocation_0-i-1] = binop(left, hlsl::mix(scan_local[idx][Config::ItemsPerInvocation_0-i-2], left_last_elem, (Config::ItemsPerInvocation_0-i-1==0))); + value[Config::ItemsPerInvocation_0-i-1] = binop(left, hlsl::mix(value[Config::ItemsPerInvocation_0-i-2], left_last_elem, (Config::ItemsPerInvocation_0-i-1==0))); } else { [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) - scan_local[idx][i] = binop(left, scan_local[idx][i]); + value[i] = binop(left, value[i]); } - dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); } } }; From d69cd6026556b57552f2edc7ad82aa9795089591 Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 20 May 2025 12:28:50 +0200 Subject: [PATCH 
143/346] correct the calculation of scratch memory needed, and avoid deadlock (allocation failure) due to worst case fragmentation also fix one memory freeing bug --- src/nbl/video/utilities/CAssetConverter.cpp | 99 ++++++++++++++------- 1 file changed, 65 insertions(+), 34 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 5d16c5bb9b..7c325cb17d 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2828,13 +2828,13 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult const auto buildFlags = patch.getBuildFlags(as); const auto outIx = i+entry.second.firstCopyIx; const auto uniqueCopyGroupID = conversionRequests.gpuObjUniqueCopyGroupIDs[outIx]; - // prevent CPU hangs by making sure allocator big enough to service us in worst case but with best case allocator (no other allocations, clean alloc) + // prevent CPU hangs by making sure allocator big enough to service us in worst case const auto minScratchAllocSize = patch.hostBuild ? 
inputs.scratchForHostASBuildMinAllocSize:inputs.scratchForDeviceASBuildMinAllocSize; - uint64_t buildSize = 0; uint32_t buildAlignment = 4; - auto incrementBuildSize = [minScratchAllocSize,&buildSize,&buildAlignment](const uint64_t size, const uint32_t alignment)->void + uint64_t buildSize = 0; + auto incrementBuildSize = [minScratchAllocSize,&buildSize](const uint64_t size, const uint32_t alignment)->void { - buildSize = core::alignUp(buildSize,alignment)+hlsl::max(size,minScratchAllocSize); - buildAlignment = hlsl::max(buildAlignment,alignment); + // account for fragmentation and misalignment + buildSize += hlsl::max(size,minScratchAllocSize)+hlsl::max(minScratchAllocSize,alignment)*2; }; ILogicalDevice::AccelerationStructureBuildSizes sizes = {}; const auto hashAsU64 = reinterpret_cast(entry.first.data); @@ -2855,35 +2855,45 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult const uint32_t* pPrimitiveCounts = as->getGeometryPrimitiveCounts().data(); if (buildFlags.hasFlags(ICPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) { - sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,as->getAABBGeometries(),pPrimitiveCounts); + const auto geoms = as->getAABBGeometries(); + sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,geoms,pPrimitiveCounts); for (const auto& geom : geoms) if (const auto aabbCount=*(pPrimitiveCounts++); aabbCount) incrementBuildSize(aabbCount*geom.stride,alignof(float)); } else { - sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,as->getTriangleGeometries(),pPrimitiveCounts); + const auto geoms = as->getTriangleGeometries(); + sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,geoms,pPrimitiveCounts); for (const auto& geom : geoms) if (const auto triCount=*(pPrimitiveCounts++); triCount) { auto size = geom.vertexStride*(geom.vertexData[1] ? 
2:1)*geom.maxVertex; + uint16_t alignment = hlsl::max(0x1u<(alignof(float),alignment); + } + uint16_t indexSize = 0; switch (geom.indexType) { case E_INDEX_TYPE::EIT_16BIT: - alignment = alignof(uint16_t); + indexSize = sizeof(uint16_t); break; case E_INDEX_TYPE::EIT_32BIT: - alignment = alignof(uint32_t); + indexSize = sizeof(uint32_t); break; default: break; } - if (alignment) - size = core::alignUp(size,alignment)+triCount*3*alignment; - incrementBuildSize(size,hlsl::max(alignment,geom.vertexStride)); + if (indexSize) + { + size = core::alignUp(size,indexSize)+triCount*3*indexSize; + alignment = hlsl::max(indexSize,alignment); + } + inputs.logger.log("%p Triangle Data Size %d Align %d",system::ILogger::ELL_DEBUG,as,size,alignment); + incrementBuildSize(size,alignment); } } } @@ -2896,8 +2906,9 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult ); continue; } - // scratch gets allocated first - buildSize = core::alignUp(hlsl::max(sizes.buildScratchSize,minScratchAllocSize),buildAlignment)+buildSize; + // + incrementBuildSize(sizes.buildScratchSize,device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment); + inputs.logger.log("%p Scratch Size %d Combined %d",system::ILogger::ELL_DEBUG,as,sizes.buildScratchSize,buildSize); // we need to save the buffer in a side-channel for later auto& out = accelerationStructureParams[IsTLAS][entry.second.firstCopyIx+i]; @@ -4718,12 +4729,14 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul { submitsNeeded |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; // queue up a deferred allocation - params.scratchForDeviceASBuild->multi_deallocate(oldAllocCount,allocOffsets.data(),allocSizes.data(),params.compute->getFutureScratchSemaphore()); + if (oldAllocCount) + params.scratchForDeviceASBuild->multi_deallocate(oldAllocCount,allocOffsets.data(),allocSizes.data(),params.compute->getFutureScratchSemaphore()); } else { // release right away - 
params.scratchForDeviceASBuild->multi_deallocate(oldAllocCount,allocOffsets.data(),allocSizes.data()); + if (oldAllocCount) + params.scratchForDeviceASBuild->multi_deallocate(oldAllocCount,allocOffsets.data(),allocSizes.data()); for (const auto& info : buildInfos) { const auto stagingFound = findInStaging.template operator()(info.dstAS); @@ -4766,7 +4779,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul auto allocCount = 0; auto deallocSrc = core::makeRAIIExiter([¶ms,&allocOffsets,&allocSizes,&alignments,&allocCount]()->void { - const auto beginIx = allocSizes.size()-alignments.size(); + const auto beginIx = allocSizes.size()-allocCount; // if got to end of loop queue up the release of memory, otherwise release right away if (allocCount) params.scratchForDeviceASBuild->multi_deallocate(allocCount,allocOffsets.data()+beginIx,allocSizes.data()+beginIx); @@ -4837,42 +4850,60 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul if (const auto triCount=*(pPrimitiveCounts++); triCount) { auto size = geom.vertexStride*(geom.vertexData[1] ? 
2:1)*geom.maxVertex; + uint16_t alignment = hlsl::max(0x1u<(alignof(float),alignment); + } + uint16_t indexSize = 0u; switch (geom.indexType) { case E_INDEX_TYPE::EIT_16BIT: - alignment = alignof(uint16_t); + indexSize = alignof(uint16_t); break; case E_INDEX_TYPE::EIT_32BIT: - alignment = alignof(uint32_t); + indexSize = alignof(uint32_t); break; default: break; } - if (alignment) - size = core::alignUp(size,alignment)+triCount*3*alignment; + if (indexSize) + { + size = core::alignUp(size,indexSize)+triCount*3*indexSize; + alignment = hlsl::max(indexSize,alignment); + } allocSizes.push_back(size); - alignments.push_back(hlsl::max(alignment,geom.vertexStride)); + alignments.push_back(alignment); + const auto tmp = asToBuild.second.scratchSize; + logger.log("%p Triangle Data Size %d Align %d Scratch Size %d",system::ILogger::ELL_DEBUG,canonical.get(),size,alignment,tmp); } } } allocOffsets.resize(allocSizes.size(),scratch_allocator_t::invalid_value); // allocate out scratch or submit overflow, if fail then flush and keep trying till space is made - auto* const offsets = allocOffsets.data()+allocOffsets.size()-alignments.size(); - const auto* const sizes = allocSizes.data()+allocSizes.size()-alignments.size(); + auto* offsets = allocOffsets.data()+allocOffsets.size()-alignments.size(); + const auto* sizes = allocSizes.data()+allocSizes.size()-alignments.size(); + logger.log("%p Combined Size %d",system::ILogger::ELL_DEBUG,canonical.get(),std::accumulate(sizes,sizes+alignments.size(),0)); for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(alignments.size(),offsets,sizes,alignments.data())!=0; t++) - if (t==1) // don't flush right away cause allocator not defragmented yet { - recordBuildCommands(); - // if writing to scratch directly, flush the writes - if (!flushRanges.empty()) + if (t==1) // don't flush right away cause allocator not defragmented yet { - device->flushMappedMemoryRanges(flushRanges); - flushRanges.clear(); + recordBuildCommands(); + // 
the submit overflow deallocates old offsets and erases them from the temp arrays, pointer changes + offsets = allocOffsets.data(); + sizes = allocSizes.data(); + // if writing to scratch directly, flush the writes + if (!flushRanges.empty()) + { + device->flushMappedMemoryRanges(flushRanges); + flushRanges.clear(); + } + drainCompute(); } - drainCompute(); + // we may be preventing ourselves from allocating memory, with one successful allocation still being alive and fragmenting our allocator + params.scratchForDeviceASBuild->multi_deallocate(alignments.size(),offsets,sizes); + std::fill_n(offsets,alignments.size(),scratch_allocator_t::invalid_value); } // now upon a failure, our allocations will need to be deallocated allocCount = alignments.size(); @@ -5055,7 +5086,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul outGeom.indexType = geom.indexType; outGeom.geometryFlags = geom.geometryFlags; } - buildInfo.triangles = reinterpret_cast* const&>(trianglesOffset); + buildInfo.triangles = reinterpret_cast* const&>(trianglesOffset); } success = pPrimitiveCounts==primitiveCounts.data()+primitiveCounts.size(); rangeInfos.push_back(reinterpret_cast(geometryRangeInfoOffset)); From 5c13a932887a527d8e53d201c4d96aca84994d05 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 20 May 2025 17:41:39 +0700 Subject: [PATCH 144/346] Remove impl_valid and rework SSpecializatioNValidationResult --- include/nbl/asset/IRayTracingPipeline.h | 21 +- include/nbl/video/IGPUComputePipeline.h | 20 +- include/nbl/video/IGPUGraphicsPipeline.h | 49 ++-- include/nbl/video/IGPUPipeline.h | 19 ++ include/nbl/video/IGPURayTracingPipeline.h | 234 ++++++-------------- include/nbl/video/SPipelineCreationParams.h | 49 ++-- 6 files changed, 152 insertions(+), 240 deletions(-) diff --git a/include/nbl/asset/IRayTracingPipeline.h b/include/nbl/asset/IRayTracingPipeline.h index 50ab7ba3f3..82b47f1fcb 100644 --- a/include/nbl/asset/IRayTracingPipeline.h +++ 
b/include/nbl/asset/IRayTracingPipeline.h @@ -24,10 +24,27 @@ class IRayTracingPipelineBase : public virtual core::IReferenceCounted template class IRayTracingPipeline : public IPipeline, public IRayTracingPipelineBase { - using base_creation_params_t = IPipeline; - public: + #define base_flag(F) static_cast(IPipelineBase::FLAGS::F) + enum class CreationFlags : uint64_t + { + NONE = base_flag(NONE), + DISABLE_OPTIMIZATIONS = base_flag(DISABLE_OPTIMIZATIONS), + ALLOW_DERIVATIVES = base_flag(ALLOW_DERIVATIVES), + FAIL_ON_PIPELINE_COMPILE_REQUIRED = base_flag(FAIL_ON_PIPELINE_COMPILE_REQUIRED), + EARLY_RETURN_ON_FAILURE = base_flag(EARLY_RETURN_ON_FAILURE), + SKIP_BUILT_IN_PRIMITIVES = 1<<12, + SKIP_AABBS = 1<<13, + NO_NULL_ANY_HIT_SHADERS = 1<<14, + NO_NULL_CLOSEST_HIT_SHADERS = 1<<15, + NO_NULL_MISS_SHADERS = 1<<16, + NO_NULL_INTERSECTION_SHADERS = 1<<17, + ALLOW_MOTION = 1<<20, + }; + #undef base_flag + using FLAGS = CreationFlags; + inline const SCachedCreationParams& getCachedCreationParams() const { return m_params; } protected: diff --git a/include/nbl/video/IGPUComputePipeline.h b/include/nbl/video/IGPUComputePipeline.h index 065c567ee2..2eb03cf2da 100644 --- a/include/nbl/video/IGPUComputePipeline.h +++ b/include/nbl/video/IGPUComputePipeline.h @@ -47,21 +47,19 @@ class IGPUComputePipeline : public IGPUPipelinesize()>0x7fffffff) - return {}; - count = static_cast(shader.entries->size()); - } - return {.count=dataSize ? 
count:0,.dataSize=static_cast(dataSize)}; + SSpecializationValidationResult retval = { + .count = 0, + .dataSize = 0, + }; + + if (!shader.accumulateSpecializationValidationResult(&retval)) + return {}; + + return retval; } IGPUPipelineLayout* layout = nullptr; diff --git a/include/nbl/video/IGPUGraphicsPipeline.h b/include/nbl/video/IGPUGraphicsPipeline.h index f5d6e40275..ae8924a1ab 100644 --- a/include/nbl/video/IGPUGraphicsPipeline.h +++ b/include/nbl/video/IGPUGraphicsPipeline.h @@ -32,15 +32,17 @@ class IGPUGraphicsPipeline : public IGPUPipeline - inline bool impl_valid(ExtraLambda&& extra) const + inline SSpecializationValidationResult valid() const { if (!layout) - return false; + return {}; + SSpecializationValidationResult retval = {.count=0,.dataSize=0}; + if (!layout) + return {}; // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-dynamicRendering-06576 if (!renderpass || cached.subpassIx>=renderpass->getSubpassCount()) - return false; + return {}; // TODO: check rasterization samples, etc. //rp->getCreationParameters().subpasses[i] @@ -49,41 +51,18 @@ class IGPUGraphicsPipeline : public IGPUPipelinebool - { - const auto dataSize = info.valid(); - if (dataSize<0) - return false; - else if (dataSize==0) - return true; - - const size_t count = info.entries ? info.entries->size():0x80000000ull; - if (count>0x7fffffff) - return {}; - retval += {.count=dataSize ? 
static_cast(count):0,.dataSize=static_cast(dataSize)}; - return retval; - }); - if (!valid) + if (!hasRequiredStages(stagePresence, cached.primitiveAssembly.primitiveType)) return {}; return retval; } diff --git a/include/nbl/video/IGPUPipeline.h b/include/nbl/video/IGPUPipeline.h index fc4bc8d219..ff6d97f17b 100644 --- a/include/nbl/video/IGPUPipeline.h +++ b/include/nbl/video/IGPUPipeline.h @@ -7,6 +7,7 @@ #define _NBL_VIDEO_I_GPU_PIPELINE_H_INCLUDED_ #include "nbl/video/IGPUPipelineLayout.h" +#include "nbl/video/SPipelineCreationParams.h" #include "nbl/asset/IPipeline.h" namespace nbl::video @@ -69,6 +70,24 @@ class IGPUPipelineBase { return static_cast(specData); } + bool accumulateSpecializationValidationResult(SSpecializationValidationResult* retval) const + { + const auto dataSize = valid(); + if (dataSize < 0) + return false; + if (dataSize == 0) + return true; + + const size_t count = entries ? entries->size() : 0x80000000ull; + if (count > 0x7fffffff) + return false; + *retval += { + .count = dataSize ? 
static_cast(count) : 0, + .dataSize = static_cast(dataSize), + }; + return *retval; + } + const asset::IShader* shader = nullptr; std::string_view entryPoint = ""; diff --git a/include/nbl/video/IGPURayTracingPipeline.h b/include/nbl/video/IGPURayTracingPipeline.h index 2a6701c9e6..f7a92252f7 100644 --- a/include/nbl/video/IGPURayTracingPipeline.h +++ b/include/nbl/video/IGPURayTracingPipeline.h @@ -15,118 +15,9 @@ class IGPURayTracingPipeline : public IGPUPipeline; public: - struct SCreationParams + struct SCreationParams : public SPipelineCreationParams { - #define base_flag(F) static_cast(IPipelineBase::FLAGS::F) - enum class FLAGS : uint64_t - { - NONE = base_flag(NONE), - DISABLE_OPTIMIZATIONS = base_flag(DISABLE_OPTIMIZATIONS), - ALLOW_DERIVATIVES = base_flag(ALLOW_DERIVATIVES), - FAIL_ON_PIPELINE_COMPILE_REQUIRED = base_flag(FAIL_ON_PIPELINE_COMPILE_REQUIRED), - EARLY_RETURN_ON_FAILURE = base_flag(EARLY_RETURN_ON_FAILURE), - SKIP_BUILT_IN_PRIMITIVES = 1<<12, - SKIP_AABBS = 1<<13, - NO_NULL_ANY_HIT_SHADERS = 1<<14, - NO_NULL_CLOSEST_HIT_SHADERS = 1<<15, - NO_NULL_MISS_SHADERS = 1<<16, - NO_NULL_INTERSECTION_SHADERS = 1<<17, - ALLOW_MOTION = 1<<20, - }; - #undef base_flag - - protected: - template - inline bool impl_valid(ExtraLambda&& extra) const - { - if (!m_layout) return false; - - for (const auto info : shaders) - { - if (info.shader) - { - if (!extra(info)) - return false; - const auto stage = info.stage; - if ((stage & ~hlsl::ShaderStage::ESS_ALL_RAY_TRACING) != 0) - return false; - if (!std::has_single_bit>(stage)) - return false; - } - else - { - // every shader must not be null. use SIndex::Unused to represent unused shader. 
- return false; - } - } - - auto getShaderStage = [this](size_t index) -> hlsl::ShaderStage - { - return shaders[index].stage; - }; - - auto isValidShaderIndex = [this, getShaderStage](size_t index, hlsl::ShaderStage expectedStage, bool is_unused_shader_forbidden) -> bool - { - if (index == SShaderGroupsParams::SIndex::Unused) - return !is_unused_shader_forbidden; - if (index >= shaders.size()) - return false; - if (getShaderStage(index) != expectedStage) - return false; - return true; - }; - - if (!isValidShaderIndex(shaderGroups.raygen.index, hlsl::ShaderStage::ESS_RAYGEN, true)) - { - return false; - } - - for (const auto& shaderGroup : shaderGroups.hits) - { - // https://docs.vulkan.org/spec/latest/chapters/pipelines.html#VUID-VkRayTracingPipelineCreateInfoKHR-flags-03470 - if (!isValidShaderIndex(shaderGroup.anyHit, - hlsl::ShaderStage::ESS_ANY_HIT, - bool(flags & FLAGS::NO_NULL_ANY_HIT_SHADERS))) - return false; - - // https://docs.vulkan.org/spec/latest/chapters/pipelines.html#VUID-VkRayTracingPipelineCreateInfoKHR-flags-03471 - if (!isValidShaderIndex(shaderGroup.closestHit, - hlsl::ShaderStage::ESS_CLOSEST_HIT, - bool(flags & FLAGS::NO_NULL_CLOSEST_HIT_SHADERS))) - return false; - - if (!isValidShaderIndex(shaderGroup.intersection, - hlsl::ShaderStage::ESS_INTERSECTION, - false)) - return false; - } - - for (const auto& shaderGroup : shaderGroups.misses) - { - if (!isValidShaderIndex(shaderGroup.index, - hlsl::ShaderStage::ESS_MISS, - false)) - return false; - } - - for (const auto& shaderGroup : shaderGroups.callables) - { - if (!isValidShaderIndex(shaderGroup.index, hlsl::ShaderStage::ESS_CALLABLE, false)) - return false; - } - return true; - } - - public: - inline bool valid() const - { - return impl_valid([](const SShaderSpecInfo& info)->bool - { - if (!info.valid()) - return false; - return false; - }); - } + using FLAGS = pipeline_t::FLAGS; struct SShaderGroupsParams { @@ -149,50 +40,12 @@ class IGPURayTracingPipeline : public IGPUPipeline flags = 
FLAGS::NONE; - }; - - - struct SShaderGroupHandle - { - private: - uint8_t data[video::SPhysicalDeviceLimits::ShaderGroupHandleSize]; - }; - static_assert(sizeof(SShaderGroupHandle) == video::SPhysicalDeviceLimits::ShaderGroupHandleSize); - - struct SHitGroupStackSize - { - uint16_t closestHit; - uint16_t anyHit; - uint16_t intersection; - }; - - using SGeneralShaderGroupContainer = core::smart_refctd_dynamic_array; - using SHitShaderGroupContainer = core::smart_refctd_dynamic_array; - - struct SCreationParams final : SPipelineCreationParams - { - #define base_flag(F) static_cast(IPipelineBase::CreationFlags::F) - enum class FLAGS : uint64_t - { - NONE = base_flag(NONE), - DISABLE_OPTIMIZATIONS = base_flag(DISABLE_OPTIMIZATIONS), - ALLOW_DERIVATIVES = base_flag(ALLOW_DERIVATIVES), - FAIL_ON_PIPELINE_COMPILE_REQUIRED = base_flag(FAIL_ON_PIPELINE_COMPILE_REQUIRED), - EARLY_RETURN_ON_FAILURE = base_flag(EARLY_RETURN_ON_FAILURE), - SKIP_BUILT_IN_PRIMITIVES = 1<<12, - SKIP_AABBS = 1<<13, - NO_NULL_ANY_HIT_SHADERS = 1<<14, - NO_NULL_CLOSEST_HIT_SHADERS = 1<<15, - NO_NULL_MISS_SHADERS = 1<<16, - NO_NULL_INTERSECTION_SHADERS = 1<<17, - ALLOW_MOTION = 1<<20, - }; - #undef base_flag inline SSpecializationValidationResult valid() const { @@ -200,31 +53,76 @@ class IGPURayTracingPipeline : public IGPUPipelinebool + + if (!shaderGroups.raygen.accumulateSpecializationValidationResult(&retval)) + return {}; + + for (const auto& shaderGroup : shaderGroups.hits) { - const auto dataSize = info.valid(); - if (dataSize<0) - return false; - else if (dataSize==0) - return true; - - const size_t count = info.entries ? info.entries->size():0x80000000ull; - if (count>0x7fffffff) + if (shaderGroup.intersection.shader) + { + if (!shaderGroup.intersection.accumulateSpecializationValidationResult(&retval)) return {}; - retval += {.count=dataSize ? 
static_cast(count):0,.dataSize=static_cast(dataSize)}; - return retval; - }); - if (!valid) - return {}; + } + + if (shaderGroup.closestHit.shader) + { + if (!shaderGroup.closestHit.accumulateSpecializationValidationResult(&retval)) + return {}; + } + + // https://docs.vulkan.org/spec/latest/chapters/pipelines.html#VUID-VkRayTracingPipelineCreateInfoKHR-flags-03470 + if (flags & FLAGS::NO_NULL_ANY_HIT_SHADERS && !shaderGroup.anyHit.shader) + return {}; + + if (shaderGroup.anyHit.shader) + { + if (!shaderGroup.anyHit.accumulateSpecializationValidationResult(&retval)) + return {}; + } + + // https://docs.vulkan.org/spec/latest/chapters/pipelines.html#VUID-VkRayTracingPipelineCreateInfoKHR-flags-03471 + if (flags & FLAGS::NO_NULL_CLOSEST_HIT_SHADERS && !shaderGroup.intersection.shader) + return {}; + } + + for (const auto& miss : shaderGroups.misses) + { + if (miss.shader) + { + if (!miss.accumulateSpecializationValidationResult(&retval)) + return {}; + } + } + + for (const auto& callable : shaderGroups.callables) + { + if (callable.shader) + { + if (!callable.accumulateSpecializationValidationResult(&retval)) + return {}; + } + } + return retval; } + }; - inline std::span getShaders() const { return shaders; } + struct SShaderGroupHandle + { + private: + uint8_t data[video::SPhysicalDeviceLimits::ShaderGroupHandleSize]; + }; + static_assert(sizeof(SShaderGroupHandle) == video::SPhysicalDeviceLimits::ShaderGroupHandleSize); - IGPUPipelineLayout* layout = nullptr; + struct SHitGroupStackSize + { + uint16_t closestHit; + uint16_t anyHit; + uint16_t intersection; }; inline core::bitflag getCreationFlags() const { return m_flags; } diff --git a/include/nbl/video/SPipelineCreationParams.h b/include/nbl/video/SPipelineCreationParams.h index 489bff4343..3a25560ae4 100644 --- a/include/nbl/video/SPipelineCreationParams.h +++ b/include/nbl/video/SPipelineCreationParams.h @@ -11,6 +11,31 @@ namespace nbl::video { +struct SSpecializationValidationResult +{ + constexpr static 
inline uint32_t Invalid = ~0u; + inline operator bool() const + { + return count!=Invalid && dataSize!=Invalid; + } + + inline SSpecializationValidationResult& operator+=(const SSpecializationValidationResult& other) + { + // TODO: check for overflow before adding + if (*this && other) + { + count += other.count; + dataSize += other.dataSize; + } + else + *this = {}; + return *this; + } + + uint32_t count = Invalid; + uint32_t dataSize = Invalid; +}; + // For now, due to API design we implicitly satisfy: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-08771 // to: @@ -18,30 +43,6 @@ namespace nbl::video template struct SPipelineCreationParams { - struct SSpecializationValidationResult - { - constexpr static inline uint32_t Invalid = ~0u; - inline operator bool() const - { - return count!=Invalid && dataSize!=Invalid; - } - - inline SSpecializationValidationResult& operator+=(const SSpecializationValidationResult& other) - { - // TODO: check for overflow before adding - if (*this && other) - { - count += other.count; - dataSize += other.dataSize; - } - else - *this = {}; - return *this; - } - - uint32_t count = Invalid; - uint32_t dataSize = Invalid; - }; constexpr static inline int32_t NotDerivingFromPreviousPipeline = -1; inline bool isDerivative() const From 483a788162180baef274e5e8afab4ef1f922893e Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Tue, 20 May 2025 14:31:44 +0200 Subject: [PATCH 145/346] add docker/msvc-winsdk submodule, update build presets & dxc/CMakeLists.txt --- 3rdparty/dxc/CMakeLists.txt | 26 ++++++----- CMakePresets.json | 88 +++---------------------------------- docker/msvc-winsdk | 2 +- 3 files changed, 22 insertions(+), 94 deletions(-) diff --git a/3rdparty/dxc/CMakeLists.txt b/3rdparty/dxc/CMakeLists.txt index ed2528c922..9432b4df07 100644 --- a/3rdparty/dxc/CMakeLists.txt +++ b/3rdparty/dxc/CMakeLists.txt @@ -41,6 +41,7 @@ 
list(APPEND NBL_DXC_CMAKE_OPTIONS "-DSPIRV_SKIP_EXECUTABLES:BOOL=ON") list(APPEND NBL_DXC_CMAKE_OPTIONS "-DHLSL_ENABLE_DEBUG_ITERATORS:BOOL=ON") list(APPEND NBL_DXC_CMAKE_OPTIONS "-DDXC_SPIRV_TOOLS_DIR=${DXC_SPIRV_TOOLS_DIR}") list(APPEND NBL_DXC_CMAKE_OPTIONS "-DDXC_SPIRV_HEADERS_DIR=${DXC_SPIRV_HEADERS_DIR}") +list(APPEND NBL_DXC_CMAKE_OPTIONS "-DDXC_ENABLE_ETW=OFF") if(NOT NBL_IS_MULTI_CONFIG) list(APPEND NBL_DXC_CMAKE_OPTIONS "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}") @@ -85,18 +86,23 @@ endif() set(DXC_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/build" CACHE INTERNAL "") -if(MSVC AND (NOT "${CMAKE_GENERATOR}" STREQUAL "Ninja Multi-Config" AND NOT "${CMAKE_GENERATOR}" STREQUAL "Ninja")) - execute_process(COMMAND "${CMAKE_COMMAND}" -C "${CMAKE_CURRENT_SOURCE_DIR}/dxc/cmake/caches/PredefinedParams.cmake" -S "${CMAKE_CURRENT_SOURCE_DIR}/dxc" -B "${DXC_BUILD_DIR}" -G "${CMAKE_GENERATOR}" "-Ax64" -T "${CMAKE_GENERATOR_TOOLSET}" ${NBL_DXC_CMAKE_OPTIONS} - RESULT_VARIABLE DXC_CMAKE_RESULT - OUTPUT_VARIABLE DXC_CMAKE_STREAM_PIPE - ) -else() - execute_process(COMMAND "${CMAKE_COMMAND}" -C "${CMAKE_CURRENT_SOURCE_DIR}/dxc/cmake/caches/PredefinedParams.cmake" -S "${CMAKE_CURRENT_SOURCE_DIR}/dxc" -B "${DXC_BUILD_DIR}" -G "${CMAKE_GENERATOR}" -T "${CMAKE_GENERATOR_TOOLSET}" ${NBL_DXC_CMAKE_OPTIONS} - RESULT_VARIABLE DXC_CMAKE_RESULT - OUTPUT_VARIABLE DXC_CMAKE_STREAM_PIPE - ) +if(NOT CMAKE_GENERATOR MATCHES "Ninja*") + list(APPEND NBL_DXC_CMAKE_OPTIONS -Ax64) +endif() + +if(CMAKE_GENERATOR_TOOLSET) + list(APPEND NBL_DXC_CMAKE_OPTIONS -T "${CMAKE_GENERATOR_TOOLSET}") endif() +if(CMAKE_TOOLCHAIN_FILE) + list(APPEND NBL_DXC_CMAKE_OPTIONS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}") +endif() + +execute_process(COMMAND "${CMAKE_COMMAND}" -C "${CMAKE_CURRENT_SOURCE_DIR}/dxc/cmake/caches/PredefinedParams.cmake" -S "${CMAKE_CURRENT_SOURCE_DIR}/dxc" -B "${DXC_BUILD_DIR}" -G "${CMAKE_GENERATOR}" ${NBL_DXC_CMAKE_OPTIONS} + RESULT_VARIABLE DXC_CMAKE_RESULT + OUTPUT_VARIABLE 
DXC_CMAKE_STREAM_PIPE +) + if(NOT "${DXC_CMAKE_RESULT}" STREQUAL "0") message(FATAL_ERROR "${DXC_CMAKE_STREAM_PIPE}") endif() diff --git a/CMakePresets.json b/CMakePresets.json index da28fc1aff..032d9ad45e 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -2,8 +2,8 @@ "version": 6, "cmakeMinimumRequired": { "major": 3, - "minor": 29, - "patch": 2 + "minor": 31, + "patch": 0 }, "configurePresets": [ { @@ -90,37 +90,17 @@ { "name": "ci-configure-static-msvc", "inherits": "ci-configure-static-windows-base", - "displayName": "[CI]: Static library target, Visual Studio 17 2022 generator, MSVC v143 toolset", - "description": "Configure as static library with Visual Studio 17 2022 generator and MSVC v143 toolset", - "generator": "Visual Studio 17 2022", - "toolset": "v143" - }, - { - "name": "ci-configure-dynamic-msvc", - "inherits": "ci-configure-dynamic-windows-base", - "displayName": "[CI]: Dynamic library target, Visual Studio 17 2022 generator, MSVC v143 toolset", - "description": "Configure as dynamic library with Visual Studio 17 2022 generator and MSVC v143 toolset", - "generator": "Visual Studio 17 2022", - "toolset": "v143" - }, - { - "name": "ci-configure-static-ninja-multi", - "inherits": "ci-configure-static-windows-base", - "displayName": "[CI]: Static library target, Ninja multi-config generator", - "description": "Configure as static library with Ninja multi-config generator", "generator": "Ninja Multi-Config", "cacheVariables": { - "CMAKE_EXPORT_COMPILE_COMMANDS": "ON" + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/docker/msvc-winsdk/cmake/winsdk-msvc-toolchain.cmake" } }, { - "name": "ci-configure-dynamic-ninja-multi", + "name": "ci-configure-dynamic-msvc", "inherits": "ci-configure-dynamic-windows-base", - "displayName": "[CI]: Dynamic library target, Ninja multi-config generator", - "description": "Configure as dynamic library with Ninja multi-config generator", "generator": "Ninja Multi-Config", "cacheVariables": { - "CMAKE_EXPORT_COMPILE_COMMANDS": 
"ON" + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/docker/msvc-winsdk/cmake/winsdk-msvc-toolchain.cmake" } }, { @@ -319,8 +299,6 @@ "configurePreset": "ci-configure-static-msvc", "inheritConfigureEnvironment": true, "inherits": "build-windows-base", - "displayName": "[CI]: Static library target, Visual Studio 17 2022 generator, MSVC v143 toolset", - "description": "Build Nabla as static library with Visual Studio 17 2022 generator and MSVC v143 toolset", "condition": { "type": "equals", "lhs": "$env{NBL_CI_MODE}", @@ -332,34 +310,6 @@ "configurePreset": "ci-configure-dynamic-msvc", "inheritConfigureEnvironment": true, "inherits": "build-windows-base", - "displayName": "[CI]: Dynamic library target, Visual Studio 17 2022 generator, MSVC v143 toolset", - "description": "Build Nabla as dynamic library with Visual Studio 17 2022 generator and MSVC v143 toolset", - "condition": { - "type": "equals", - "lhs": "$env{NBL_CI_MODE}", - "rhs": "ON" - } - }, - { - "name": "ci-build-static-ninja-multi", - "configurePreset": "ci-configure-static-ninja-multi", - "inheritConfigureEnvironment": true, - "inherits": "build-windows-base", - "displayName": "[CI]: Static library target, Ninja multi-config generator", - "description": "Build Nabla as static library with Ninja multi-config generator", - "condition": { - "type": "equals", - "lhs": "$env{NBL_CI_MODE}", - "rhs": "ON" - } - }, - { - "name": "ci-build-dynamic-ninja-multi", - "configurePreset": "ci-configure-dynamic-ninja-multi", - "inheritConfigureEnvironment": true, - "inherits": "build-windows-base", - "displayName": "[CI]: Dynamic library target, Ninja multi-config generator", - "description": "Build Nabla as dynamic library with Ninja multi-config generator", "condition": { "type": "equals", "lhs": "$env{NBL_CI_MODE}", @@ -466,8 +416,6 @@ "name": "ci-package-static-msvc", "inherits": "ci-package-windows-base", "configurePreset": "ci-configure-static-msvc", - "displayName": "[CI]: Static library target, Visual Studio 17 2022 
generator, MSVC v143 toolset", - "description": "Package Nabla as static library compiled with Visual Studio 17 2022 generator and MSVC v143 toolset", "condition": { "type": "equals", "lhs": "$env{NBL_CI_MODE}", @@ -478,32 +426,6 @@ "name": "ci-package-dynamic-msvc", "inherits": "ci-package-windows-base", "configurePreset": "ci-configure-dynamic-msvc", - "displayName": "[CI]: Dynamic library target, Visual Studio 17 2022 generator, MSVC v143 toolset", - "description": "Package Nabla as dynamic library compiled with Visual Studio 17 2022 generator and MSVC v143 toolset", - "condition": { - "type": "equals", - "lhs": "$env{NBL_CI_MODE}", - "rhs": "ON" - } - }, - { - "name": "ci-package-static-ninja-multi", - "inherits": "ci-package-windows-base", - "configurePreset": "ci-configure-static-ninja-multi", - "displayName": "[CI]: Static library target, Ninja multi-config generator", - "description": "Package Nabla as static library compiled with Ninja multi-config generator", - "condition": { - "type": "equals", - "lhs": "$env{NBL_CI_MODE}", - "rhs": "ON" - } - }, - { - "name": "ci-package-dynamic-ninja-multi", - "inherits": "ci-package-windows-base", - "configurePreset": "ci-configure-dynamic-ninja-multi", - "displayName": "[CI]: Dynamic library target, Ninja multi-config generator", - "description": "Package Nabla as dynamic library compiled with Ninja multi-config generator", "condition": { "type": "equals", "lhs": "$env{NBL_CI_MODE}", diff --git a/docker/msvc-winsdk b/docker/msvc-winsdk index 8aa9e767ec..8aa6a18115 160000 --- a/docker/msvc-winsdk +++ b/docker/msvc-winsdk @@ -1 +1 @@ -Subproject commit 8aa9e767ec60aa77f477ac6cf41728e997dcc950 +Subproject commit 8aa6a1811528e82982b2f462515ff9a0e2947e72 From fe75f42223a3b182f66390d2919a60797f430f13 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Tue, 20 May 2025 21:19:44 +0200 Subject: [PATCH 146/346] update docker/msvc-winsdk submodule, minor .env file change --- docker/.env | 2 -- docker/ci-windows.env | 2 ++ 
docker/msvc-winsdk | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) delete mode 100644 docker/.env create mode 100644 docker/ci-windows.env diff --git a/docker/.env b/docker/.env deleted file mode 100644 index 623184f422..0000000000 --- a/docker/.env +++ /dev/null @@ -1,2 +0,0 @@ -THIS_PROJECT_WORKING_DIRECTORY=C:\docker -THIS_PROJECT_NABLA_DIRECTORY=C:/Users/ContainerAdministrator/Nabla/bind \ No newline at end of file diff --git a/docker/ci-windows.env b/docker/ci-windows.env new file mode 100644 index 0000000000..ea89ce43c7 --- /dev/null +++ b/docker/ci-windows.env @@ -0,0 +1,2 @@ +NBL_CI_MODE=ON +NBL_CI_BUILD_DIRECTORY=C:\mount\nabla\build-ct \ No newline at end of file diff --git a/docker/msvc-winsdk b/docker/msvc-winsdk index 8aa6a18115..831515f599 160000 --- a/docker/msvc-winsdk +++ b/docker/msvc-winsdk @@ -1 +1 @@ -Subproject commit 8aa6a1811528e82982b2f462515ff9a0e2947e72 +Subproject commit 831515f59919fbe97653804a5fc634aeb36d360e From 6eecd13616fb95596754b3aad1e4629ad6c4eaa2 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Tue, 20 May 2025 21:58:23 +0200 Subject: [PATCH 147/346] update CMakePresets.json, add docker/ninja.env --- CMakePresets.json | 3 ++- docker/ninja.env | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 docker/ninja.env diff --git a/CMakePresets.json b/CMakePresets.json index 032d9ad45e..c6396e4154 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -20,7 +20,8 @@ "NBL_EXPLICIT_MODULE_LOAD_LOG": "ON", "NBL_CPACK_NO_BUILD_DIRECTORY_MODULES": "ON", "NBL_RUN_TESTS": "ON", - "NBL_CPACK_CI": "ON" + "NBL_CPACK_CI": "ON", + "GIT_FAIL_IF_NONZERO_EXIT": "OFF" } }, { diff --git a/docker/ninja.env b/docker/ninja.env new file mode 100644 index 0000000000..9c6e70104c --- /dev/null +++ b/docker/ninja.env @@ -0,0 +1 @@ +NINJA_STATUS="[%r jobs, %f/%t edges, %oe/s, elapsed %ws]: " \ No newline at end of file From 9c596770659f4f8c2c6247c9c56cfbc57c311227 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 21 May 
2025 10:57:48 +0700 Subject: [PATCH 148/346] minor fixes --- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 8bfd8b0194..9744798c6f 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -121,7 +121,9 @@ struct reduce for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.template get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); lv1_val = reduction1(lv1_val); - scratchAccessor.template set(invocationIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + + if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) + scratchAccessor.template set(0, lv1_val[Config::ItemsPerInvocation_1-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -130,7 +132,7 @@ struct reduce for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { scalar_t reduce_val; - scratchAccessor.template get(glsl::gl_SubgroupInvocationID(),reduce_val); + scratchAccessor.template get(0,reduce_val); dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, hlsl::promote(reduce_val)); } } @@ -179,9 +181,9 @@ struct scan [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.template get(i*Config::SubgroupsPerVirtualWorkgroup+prevIndex,lv1_val[i]); - vector_lv1_t shiftedInput = hlsl::mix(hlsl::promote(BinOp::identity), lv1_val, bool(invocationIndex)); - shiftedInput = inclusiveScan1(shiftedInput); - scratchAccessor.template set(invocationIndex, shiftedInput[Config::ItemsPerInvocation_1-1]); + lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); + lv1_val = inclusiveScan1(lv1_val); + scratchAccessor.template set(invocationIndex, 
lv1_val[Config::ItemsPerInvocation_1-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -284,7 +286,7 @@ struct reduce for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { scalar_t reduce_val; - scratchAccessor.template get(glsl::gl_SubgroupInvocationID(),reduce_val); + scratchAccessor.template get(0,reduce_val); dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, reduce_val); } } @@ -353,8 +355,8 @@ struct scan [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) scratchAccessor.template get(lv1_smem_size+i*Config::SubgroupSize+prevIndex,lv2_val[i]); - vector_lv2_t shiftedInput = hlsl::mix(hlsl::promote(BinOp::identity), lv2_val, bool(invocationIndex)); - shiftedInput = inclusiveScan2(shiftedInput); + lv2_val[0] = hlsl::mix(hlsl::promote(BinOp::identity), lv2_val[0], bool(invocationIndex)); + vector_lv2_t shiftedScan = inclusiveScan2(lv2_val); // combine with level 1, only last element of each [unroll] @@ -363,7 +365,7 @@ struct scan scalar_t last_val; scratchAccessor.template get((Config::ItemsPerInvocation_1-1)*Config::SubgroupsPerVirtualWorkgroup+(Config::SubgroupsPerVirtualWorkgroup-1-i),last_val); scalar_t val = hlsl::mix(hlsl::promote(BinOp::identity), lv2_val, bool(i)); - val = binop(last_val, shiftedInput[Config::ItemsPerInvocation_2-1]); + val = binop(last_val, shiftedScan[Config::ItemsPerInvocation_2-1]); scratchAccessor.template set((Config::ItemsPerInvocation_1-1)*Config::SubgroupsPerVirtualWorkgroup+(Config::SubgroupsPerVirtualWorkgroup-1-i), last_val); } } From eb442624fbd1c2b1f9e8b38b73714f107a0eead7 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 21 May 2025 13:55:17 +0700 Subject: [PATCH 149/346] moved indexing functionality to config struct --- .../hlsl/workgroup2/arithmetic_config.hlsl | 10 ++++++++ .../builtin/hlsl/workgroup2/shared_scan.hlsl | 24 +++++++++---------- 2 files changed, 22 insertions(+), 12 
deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 12f65420ca..5263a3fec8 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -75,6 +75,16 @@ struct ArithmeticConfiguration static_assert(ItemsPerInvocation_1<=4, "3 level scan would have been needed with this config!"); NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementCount = conditional_value::value + SubgroupSize*ItemsPerInvocation_1>::value; + + static uint32_t virtualSubgroupID(const uint32_t id, const uint32_t offset) + { + return offset * (WorkgroupSize >> SubgroupSizeLog2) + id; + } + + static uint32_t sharedMemCoalescedIndex(const uint32_t id, const uint32_t itemsPerInvocation) + { + return (id & (itemsPerInvocation-1)) * SubgroupsPerVirtualWorkgroup + (id/itemsPerInvocation); + } }; template diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 9744798c6f..af4fb7f44d 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -105,8 +105,8 @@ struct reduce scan_local = reduction0(scan_local); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { - const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); + const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + 
(virtualSubgroupID/Config::ItemsPerInvocation_1); scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -165,8 +165,8 @@ struct scan dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { - const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); + const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -194,7 +194,7 @@ struct scan vector_lv0_t value; dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); scalar_t left; scratchAccessor.template get(virtualSubgroupID,left); if (Exclusive) @@ -244,8 +244,8 @@ struct reduce scan_local = reduction0(scan_local); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { - const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * 
Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); + const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -262,7 +262,7 @@ struct reduce lv1_val = reduction1(lv1_val); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { - const uint32_t bankedIndex = (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (invocationIndex/Config::ItemsPerInvocation_2); + const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(invocationIndex, Config::ItemsPerInvocation_2); // (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (invocationIndex/Config::ItemsPerInvocation_2); scratchAccessor.template set(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } @@ -321,8 +321,8 @@ struct scan dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { - const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); + const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); scratchAccessor.template set(bankedIndex, 
value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -340,7 +340,7 @@ struct scan lv1_val = inclusiveScan1(lv1_val); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { - const uint32_t bankedIndex = (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); + const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); scratchAccessor.template set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } @@ -378,7 +378,7 @@ struct scan vector_lv0_t value; dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); // idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const scalar_t left; scratchAccessor.template get(virtualSubgroupID, left); if (Exclusive) From 573ce446790c3d56e71c1783668ddc3d75d1c2f1 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 21 May 2025 15:02:00 +0700 Subject: [PATCH 150/346] reduction returns value instead of saving directly to storage --- examples_tests | 2 +- .../builtin/hlsl/workgroup2/arithmetic.hlsl | 8 ++-- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 43 +++++++++++-------- 3 files changed, 31 insertions(+), 22 deletions(-) diff --git a/examples_tests b/examples_tests index 44c34a8a65..0ccd26fc93 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 44c34a8a65866fb6304c12032efd08e2338c7116 +Subproject commit 0ccd26fc93d22587219b12291f855929949cef74 diff --git 
a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl index d0a26cdf94..e4a71bdffc 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl @@ -22,11 +22,13 @@ namespace workgroup2 template struct reduction { - template && ArithmeticSharedMemoryAccessor) - static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + using scalar_t = typename BinOp::type_t; + + template && ArithmeticSharedMemoryAccessor) + static scalar_t __call(NBL_REF_ARG(ReadOnlyDataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { impl::reduce fn; - fn.template __call(dataAccessor, scratchAccessor); + return fn.template __call(dataAccessor, scratchAccessor); } }; diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index af4fb7f44d..7a4d4764f4 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -36,7 +36,7 @@ struct reduce // doesn't use scratch smem, need as param? 
template - void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { using config_t = subgroup2::Configuration; using params_t = subgroup2::ArithmeticParams; @@ -45,7 +45,8 @@ struct reduce vector_t value; dataAccessor.template get(workgroup::SubgroupContiguousIndex(), value); value = reduction(value); - dataAccessor.template set(workgroup::SubgroupContiguousIndex(), value); + return value[0]; + // dataAccessor.template set(workgroup::SubgroupContiguousIndex(), value); } }; @@ -87,7 +88,7 @@ struct reduce using vector_lv1_t = vector; template - void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { using config_t = subgroup2::Configuration; using params_lv0_t = subgroup2::ArithmeticParams; @@ -128,13 +129,16 @@ struct reduce scratchAccessor.workgroupExecutionAndMemoryBarrier(); // set as last element in scan (reduction) - [unroll] - for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) - { - scalar_t reduce_val; - scratchAccessor.template get(0,reduce_val); - dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, hlsl::promote(reduce_val)); - } + // [unroll] + // for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + // { + // scalar_t reduce_val; + // scratchAccessor.template get(0,reduce_val); + // dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, hlsl::promote(reduce_val)); + // } + scalar_t reduce_val; + scratchAccessor.template get(0,reduce_val); + return reduce_val; } }; @@ -225,7 +229,7 @@ struct reduce using vector_lv2_t = vector; template - void 
__call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { using config_t = subgroup2::Configuration; using params_lv0_t = subgroup2::ArithmeticParams; @@ -282,13 +286,16 @@ struct reduce scratchAccessor.workgroupExecutionAndMemoryBarrier(); // set as last element in scan (reduction) - [unroll] - for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) - { - scalar_t reduce_val; - scratchAccessor.template get(0,reduce_val); - dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, reduce_val); - } + // [unroll] + // for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + // { + // scalar_t reduce_val; + // scratchAccessor.template get(0,reduce_val); + // dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, reduce_val); + // } + scalar_t reduce_val; + scratchAccessor.template get(0,reduce_val); + return reduce_val; } }; From 487c3deb108e145652d0c374ce7ff44c67a0d3ff Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 21 May 2025 10:52:22 +0200 Subject: [PATCH 151/346] Create .github/workflows/build-nabla.yml, update CMakePresets.json for ClangCL vendor, leave minor comments to top CMakeLists.txt --- .github/workflows/build-nabla.yml | 98 +++++++++++++++++++++++++++++++ CMakeLists.txt | 4 +- CMakePresets.json | 40 ++++++++++++- docker/ninja.env | 2 +- 4 files changed, 140 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/build-nabla.yml diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml new file mode 100644 index 0000000000..967953aeef --- /dev/null +++ b/.github/workflows/build-nabla.yml @@ -0,0 +1,98 @@ +name: Build Nabla Workflow + +on: + push: + pull_request: + 
workflow_dispatch: + +jobs: + build-windows: + runs-on: windows-2022 + + env: + image: ghcr.io/devsh-graphics-programming/docker-nanoserver-msvc-winsdk + entry: pwsh.exe + cmd: -NoLogo -NoProfile -ExecutionPolicy Bypass + mount: C:\mount\nabla + binary: C:\mount\nabla\build-ct + install: C:\mount\nabla\build-ct\install + + strategy: + fail-fast: false + matrix: + vendor: [msvc, clangcl] + config: [Release, Debug, RelWithDebInfo] + tag: ['17.13.6'] + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: 'true' + + - name: Set prefix + id: set-prefix + run: | + echo "prefix=run-windows-${{ matrix.tag }}-${{ matrix.vendor }}-${{ matrix.config }}" >> $GITHUB_OUTPUT + + - name: Pull Image + run: docker pull "${{ env.image }}:${{ matrix.tag }}" + + - name: Run Container + run: | + docker run \ + --entrypoint ${{ env.entry }} -di --isolation process \ + --env-file .\docker\ci-windows.env \ + --env-file .\docker\ninja.env \ + --name orphan \ + -v "${{ github.workspace }}:${{ env.mount }}" \ + -w "${{ env.mount }}" \ + "${{ env.image }}:${{ matrix.tag }}" \ + ${{ env.cmd }} + + - name: Inspect Container + run: docker inspect orphan + + - name: Container -- Configure Project with CMake + run: | + docker exec orphan \ + ${{ env.entry }} ${{ env.cmd }} cmake \ + --preset ci-configure-dynamic-${{ matrix.vendor }} \ + --profiling-output=profiling/cmake-profiling.json \ + --profiling-format=google-trace + + - name: Container -- Build NSC + run: | + docker exec orphan \ + ${{ env.entry }} ${{ env.cmd }} cmake --build \ + --preset ci-build-dynamic-${{ matrix.vendor }} \ + -t nsc --config ${{ matrix.config }} + + - name: Container -- Install NSC + run: | + docker exec orphan \ + ${{ env.entry }} ${{ env.cmd }} cmake --install \ + ${{ env.binary }} --config ${{ matrix.config }} \ + --component Runtimes --prefix ${{ env.install }} + + docker exec orphan \ + ${{ env.entry }} ${{ env.cmd }} cmake --install \ + ${{ env.binary }} --config ${{ matrix.config }} \ + 
--component Executables --prefix ${{ env.install }} + + - name: Package workflow artifacts + run: | + tar -cvf "${{ steps.set-prefix.outputs.prefix }}-profiling.tar" -C profiling . + tar -cvf "${{ steps.set-prefix.outputs.prefix }}-install.tar" -C ${{ env.install }} . + + - name: Upload profiling artifacts + uses: actions/upload-artifact@v4 + with: + name: ${{ steps.set-prefix.outputs.prefix }}-profiling + path: ${{ steps.set-prefix.outputs.prefix }}-profiling.tar + + - name: Upload install artifacts + uses: actions/upload-artifact@v4 + with: + name: ${{ steps.set-prefix.outputs.prefix }}-install + path: ${{ steps.set-prefix.outputs.prefix }}-install.tar \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index f24877148b..c6664f8085 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +# Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O. # This file is part of the "Nabla Engine". 
# For conditions of distribution and use, see copyright notice in nabla.h.in or nabla.h cmake_minimum_required(VERSION 3.31) @@ -33,7 +33,9 @@ if(MSVC) link_libraries(delayimp) endif() +# TODO: TO BE KILLED, keep both in one tree option(NBL_STATIC_BUILD "" OFF) # ON for static builds, OFF for shared + option(NBL_COMPILER_DYNAMIC_RUNTIME "" ON) option(NBL_SANITIZE_ADDRESS OFF) diff --git a/CMakePresets.json b/CMakePresets.json index c6396e4154..ad3ae50b6d 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -10,7 +10,6 @@ "name": "ci-configure-base", "hidden": true, "cacheVariables": { - "NBL_CI_MODE": "ON", "NBL_UPDATE_GIT_SUBMODULE": "OFF", "NBL_COMPILE_WITH_CUDA": "OFF", "NBL_BUILD_OPTIX": "OFF", @@ -19,7 +18,6 @@ "_NBL_COMPILE_WITH_OPEN_EXR_": "ON", "NBL_EXPLICIT_MODULE_LOAD_LOG": "ON", "NBL_CPACK_NO_BUILD_DIRECTORY_MODULES": "ON", - "NBL_RUN_TESTS": "ON", "NBL_CPACK_CI": "ON", "GIT_FAIL_IF_NONZERO_EXIT": "OFF" } @@ -104,6 +102,22 @@ "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/docker/msvc-winsdk/cmake/winsdk-msvc-toolchain.cmake" } }, + { + "name": "ci-configure-static-clangcl", + "inherits": "ci-configure-static-windows-base", + "generator": "Ninja Multi-Config", + "cacheVariables": { + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/docker/msvc-winsdk/cmake/winsdk-clangcl-toolchain.cmake" + } + }, + { + "name": "ci-configure-dynamic-clangcl", + "inherits": "ci-configure-dynamic-windows-base", + "generator": "Ninja Multi-Config", + "cacheVariables": { + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/docker/msvc-winsdk/cmake/winsdk-clangcl-toolchain.cmake" + } + }, { "name": "user-configure-base", "hidden": true, @@ -316,6 +330,28 @@ "lhs": "$env{NBL_CI_MODE}", "rhs": "ON" } + }, + { + "name": "ci-build-static-clangcl", + "configurePreset": "ci-configure-static-clangcl", + "inheritConfigureEnvironment": true, + "inherits": "build-windows-base", + "condition": { + "type": "equals", + "lhs": "$env{NBL_CI_MODE}", + "rhs": "ON" + } + }, + { + "name": "ci-build-dynamic-clangcl", + 
"configurePreset": "ci-configure-dynamic-clangcl", + "inheritConfigureEnvironment": true, + "inherits": "build-windows-base", + "condition": { + "type": "equals", + "lhs": "$env{NBL_CI_MODE}", + "rhs": "ON" + } }, { "name": "user-build-static-msvc", diff --git a/docker/ninja.env b/docker/ninja.env index 9c6e70104c..6d52cbd701 100644 --- a/docker/ninja.env +++ b/docker/ninja.env @@ -1 +1 @@ -NINJA_STATUS="[%r jobs, %f/%t edges, %oe/s, elapsed %ws]: " \ No newline at end of file +NINJA_STATUS=[%r jobs, %f/%t edges, %oe/s, elapsed %ws]: \ No newline at end of file From 473cdcd3e3c75d98ecfa783899fd71d61d03a4a3 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Wed, 21 May 2025 11:03:41 +0200 Subject: [PATCH 152/346] Update build-nabla.yml, adjust CLI escape chars to pwsh --- .github/workflows/build-nabla.yml | 57 ++++++++++++++++--------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 967953aeef..691b28f316 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -36,48 +36,49 @@ jobs: echo "prefix=run-windows-${{ matrix.tag }}-${{ matrix.vendor }}-${{ matrix.config }}" >> $GITHUB_OUTPUT - name: Pull Image - run: docker pull "${{ env.image }}:${{ matrix.tag }}" + run: | + docker pull "${{ env.image }}:${{ matrix.tag }}" - name: Run Container run: | - docker run \ - --entrypoint ${{ env.entry }} -di --isolation process \ - --env-file .\docker\ci-windows.env \ - --env-file .\docker\ninja.env \ - --name orphan \ - -v "${{ github.workspace }}:${{ env.mount }}" \ - -w "${{ env.mount }}" \ - "${{ env.image }}:${{ matrix.tag }}" \ + docker run ` + --entrypoint ${{ env.entry }} -di --isolation process ` + --env-file .\docker\ci-windows.env ` + --env-file .\docker\ninja.env ` + --name orphan ` + -v "${{ github.workspace }}:${{ env.mount }}" ` + -w "${{ env.mount }}" ` + "${{ env.image }}:${{ 
matrix.tag }}" ` ${{ env.cmd }} - name: Inspect Container - run: docker inspect orphan + run: | + docker inspect orphan - - name: Container -- Configure Project with CMake + - name: Container – Configure Project with CMake run: | - docker exec orphan \ - ${{ env.entry }} ${{ env.cmd }} cmake \ - --preset ci-configure-dynamic-${{ matrix.vendor }} \ - --profiling-output=profiling/cmake-profiling.json \ + docker exec orphan ` + ${{ env.entry }} ${{ env.cmd }} cmake ` + --preset ci-configure-dynamic-${{ matrix.vendor }} ` + --profiling-output=profiling/cmake-profiling.json ` --profiling-format=google-trace - - name: Container -- Build NSC + - name: Container – Build NSC run: | - docker exec orphan \ - ${{ env.entry }} ${{ env.cmd }} cmake --build \ - --preset ci-build-dynamic-${{ matrix.vendor }} \ + docker exec orphan ` + ${{ env.entry }} ${{ env.cmd }} cmake --build ` + --preset ci-build-dynamic-${{ matrix.vendor }} ` -t nsc --config ${{ matrix.config }} - - name: Container -- Install NSC + - name: Container – Install NSC run: | - docker exec orphan \ - ${{ env.entry }} ${{ env.cmd }} cmake --install \ - ${{ env.binary }} --config ${{ matrix.config }} \ + docker exec orphan ` + ${{ env.entry }} ${{ env.cmd }} cmake --install ` + ${{ env.binary }} --config ${{ matrix.config }} ` --component Runtimes --prefix ${{ env.install }} - - docker exec orphan \ - ${{ env.entry }} ${{ env.cmd }} cmake --install \ - ${{ env.binary }} --config ${{ matrix.config }} \ + docker exec orphan ` + ${{ env.entry }} ${{ env.cmd }} cmake --install ` + ${{ env.binary }} --config ${{ matrix.config }} ` --component Executables --prefix ${{ env.install }} - name: Package workflow artifacts @@ -95,4 +96,4 @@ jobs: uses: actions/upload-artifact@v4 with: name: ${{ steps.set-prefix.outputs.prefix }}-install - path: ${{ steps.set-prefix.outputs.prefix }}-install.tar \ No newline at end of file + path: ${{ steps.set-prefix.outputs.prefix }}-install.tar From a1a7b6a07d45f5baa9020960af83a2a92e947ecc 
Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Wed, 21 May 2025 11:22:56 +0200 Subject: [PATCH 153/346] Update build-nabla.yml, add unpack packages step, correct container CLI steps --- .github/workflows/build-nabla.yml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 691b28f316..649c00d441 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -55,10 +55,15 @@ jobs: run: | docker inspect orphan + - name: Container – Unpack Packages + run: | + docker exec orphan ` + ${{ env.entry }} ${{ env.cmd }} C:\unpack.ps1 + - name: Container – Configure Project with CMake run: | docker exec orphan ` - ${{ env.entry }} ${{ env.cmd }} cmake ` + ${{ env.entry }} ${{ env.cmd }} -Command cmake ` --preset ci-configure-dynamic-${{ matrix.vendor }} ` --profiling-output=profiling/cmake-profiling.json ` --profiling-format=google-trace @@ -66,18 +71,18 @@ jobs: - name: Container – Build NSC run: | docker exec orphan ` - ${{ env.entry }} ${{ env.cmd }} cmake --build ` + ${{ env.entry }} ${{ env.cmd }} -Command cmake --build ` --preset ci-build-dynamic-${{ matrix.vendor }} ` -t nsc --config ${{ matrix.config }} - name: Container – Install NSC run: | docker exec orphan ` - ${{ env.entry }} ${{ env.cmd }} cmake --install ` + ${{ env.entry }} ${{ env.cmd }} -Command cmake --install ` ${{ env.binary }} --config ${{ matrix.config }} ` --component Runtimes --prefix ${{ env.install }} docker exec orphan ` - ${{ env.entry }} ${{ env.cmd }} cmake --install ` + ${{ env.entry }} ${{ env.cmd }} -Command cmake --install ` ${{ env.binary }} --config ${{ matrix.config }} ` --component Executables --prefix ${{ env.install }} From 23d18b61a7c943ba63d175a3c2508a53d4029210 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Wed, 21 May 2025 11:32:10 +0200 
Subject: [PATCH 154/346] Update build-nabla.yml, recurse submodules, use profiling directory --- .github/workflows/build-nabla.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 649c00d441..bd50b79499 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -28,7 +28,7 @@ jobs: - name: Checkout uses: actions/checkout@v4 with: - submodules: 'true' + submodules: 'recursive' - name: Set prefix id: set-prefix @@ -62,6 +62,7 @@ jobs: - name: Container – Configure Project with CMake run: | + mkdir profiling docker exec orphan ` ${{ env.entry }} ${{ env.cmd }} -Command cmake ` --preset ci-configure-dynamic-${{ matrix.vendor }} ` From 49ca655e7f11fbc8db64d1c7adb6658938251058 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 21 May 2025 16:42:28 +0700 Subject: [PATCH 155/346] fixes to 2-level scan indexing --- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 28 ++++++------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 7a4d4764f4..eca7ababd2 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -128,14 +128,6 @@ struct reduce } scratchAccessor.workgroupExecutionAndMemoryBarrier(); - // set as last element in scan (reduction) - // [unroll] - // for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) - // { - // scalar_t reduce_val; - // scratchAccessor.template get(0,reduce_val); - // dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, hlsl::promote(reduce_val)); - // } scalar_t reduce_val; scratchAccessor.template get(0,reduce_val); return reduce_val; @@ -187,7 +179,9 @@ struct scan scratchAccessor.template 
get(i*Config::SubgroupsPerVirtualWorkgroup+prevIndex,lv1_val[i]); lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); lv1_val = inclusiveScan1(lv1_val); - scratchAccessor.template set(invocationIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template set(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -199,14 +193,16 @@ struct scan dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); + const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); scalar_t left; - scratchAccessor.template get(virtualSubgroupID,left); + scratchAccessor.template get(bankedIndex,left); if (Exclusive) { scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(value[Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) - value[Config::ItemsPerInvocation_0-i-1] = binop(left, hlsl::mix(value[Config::ItemsPerInvocation_0-i-2], left_last_elem, (Config::ItemsPerInvocation_0-i-1==0))); + for (uint32_t i = Config::ItemsPerInvocation_0-1; i > 0; i--) + value[i] = binop(left, value[i-1]); + value[0] = binop(left, left_last_elem); } else { @@ -285,14 +281,6 @@ struct reduce } scratchAccessor.workgroupExecutionAndMemoryBarrier(); - // set as last element in scan (reduction) - // [unroll] - // for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) - // { - // scalar_t reduce_val; - // scratchAccessor.template get(0,reduce_val); - // dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, reduce_val); - // } scalar_t reduce_val; 
scratchAccessor.template get(0,reduce_val); return reduce_val; From 756f90dfbe77f6532b9abe5d198c19d5f303bbe3 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Wed, 21 May 2025 12:00:48 +0200 Subject: [PATCH 156/346] Update build-nabla.yml, go unprotected 0x45 to avoid scans slowing down builds --- .github/workflows/build-nabla.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index bd50b79499..3faa976c8a 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -25,6 +25,18 @@ jobs: tag: ['17.13.6'] steps: + - name: Environment Setup + run: | + Add-MpPreference -ExclusionPath "${{ github.workspace }}" + Add-MpPreference -ExclusionExtension "*.*" + Add-MpPreference -ExclusionProcess "docker.exe" + Add-MpPreference -ExclusionProcess "dockerd.exe" + Set-MpPreference -RemediationScheduleDay 8 + Set-MpPreference -DisableRealtimeMonitoring $true + Set-MpPreference -DisableRemovableDriveScanning $true + Set-MpPreference -DisableArchiveScanning $true + Set-MpPreference -DisableScanningMappedNetworkDrivesForFullScan $true + - name: Checkout uses: actions/checkout@v4 with: From e2ea8d46fa069f72995c38a9ef2d33090daf68b9 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Wed, 21 May 2025 13:19:04 +0200 Subject: [PATCH 157/346] Update build-nabla.yml, correct install directory --- .github/workflows/build-nabla.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 3faa976c8a..0af67ba08d 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -15,7 +15,7 @@ jobs: cmd: -NoLogo -NoProfile -ExecutionPolicy Bypass mount: C:\mount\nabla binary: C:\mount\nabla\build-ct - install: C:\mount\nabla\build-ct\install + install: build-ct\install 
strategy: fail-fast: false From 66a49ab55ea090db5fdd9d3dc9b9408cdf384fa7 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Wed, 21 May 2025 14:31:23 +0200 Subject: [PATCH 158/346] Update build-nabla.yml, update shell for prefix setup --- .github/workflows/build-nabla.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 0af67ba08d..be333c8f7b 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -44,8 +44,9 @@ jobs: - name: Set prefix id: set-prefix + shell: bash run: | - echo "prefix=run-windows-${{ matrix.tag }}-${{ matrix.vendor }}-${{ matrix.config }}" >> $GITHUB_OUTPUT + echo "prefix=run-windows-${{ matrix.tag }}-${{ matrix.vendor }}-${{ matrix.config }}" >> "$GITHUB_OUTPUT" - name: Pull Image run: | From 731f0776abf094ba22af69db0c401fb3fe0f85ec Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 21 May 2025 15:40:45 +0200 Subject: [PATCH 159/346] fix various sync bugs in AS building --- src/nbl/video/utilities/CAssetConverter.cpp | 38 ++++++++++++--------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 7c325cb17d..b357e2e2bb 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2868,7 +2868,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (const auto& geom : geoms) if (const auto triCount=*(pPrimitiveCounts++); triCount) { - auto size = geom.vertexStride*(geom.vertexData[1] ? 2:1)*geom.maxVertex; + auto size = geom.vertexStride*(geom.vertexData[1] ? 
2:1)*(geom.maxVertex+1); uint16_t alignment = hlsl::max(0x1u< SReserveResult size = core::alignUp(size,indexSize)+triCount*3*indexSize; alignment = hlsl::max(indexSize,alignment); } - inputs.logger.log("%p Triangle Data Size %d Align %d",system::ILogger::ELL_DEBUG,as,size,alignment); + //inputs.logger.log("%p Triangle Data Size %d Align %d",system::ILogger::ELL_DEBUG,as,size,alignment); incrementBuildSize(size,alignment); } } @@ -2908,7 +2908,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } // incrementBuildSize(sizes.buildScratchSize,device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment); - inputs.logger.log("%p Scratch Size %d Combined %d",system::ILogger::ELL_DEBUG,as,sizes.buildScratchSize,buildSize); + //inputs.logger.log("%p Scratch Size %d Combined %d",system::ILogger::ELL_DEBUG,as,sizes.buildScratchSize,buildSize); // we need to save the buffer in a side-channel for later auto& out = accelerationStructureParams[IsTLAS][entry.second.firstCopyIx+i]; @@ -4632,7 +4632,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul auto* scratchBuffer = params.scratchForDeviceASBuild->getBuffer(); core::vector flushRanges; const bool manualFlush = scratchBuffer->getBoundMemory().memory->haveToMakeVisible(); - if (manualFlush) // TLAS builds do max 2 writes each and BLAS do much more anyway + if (deviceASBuildScratchPtr && manualFlush) // TLAS builds do max 2 writes each and BLAS do much more anyway flushRanges.reserve(asCount*2); // lambdas! 
auto streamDataToScratch = [&](const size_t offset, const size_t size,IUtilities::IUpstreamingDataProducer& callback) -> bool @@ -4644,10 +4644,14 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul flushRanges.emplace_back(scratchBuffer->getBoundMemory().memory,offset,size,ILogicalDevice::MappedMemoryRange::align_non_coherent_tag); return true; } - else if (const SBufferRange range={.offset=offset,.size=size,.buffer=smart_refctd_ptr(scratchBuffer)}; params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,callback)) - return true; else - return false; + { + const SBufferRange range={.offset=offset,.size=size,.buffer=smart_refctd_ptr(scratchBuffer)}; + const bool retval = params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,callback); + // current recording buffer may have changed + xferCmdBuf = params.transfer->getCommandBufferForRecording(); + return retval; + } }; // core::vector buildInfos; @@ -4849,7 +4853,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul for (const auto& geom : canonical->getTriangleGeometries()) if (const auto triCount=*(pPrimitiveCounts++); triCount) { - auto size = geom.vertexStride*(geom.vertexData[1] ? 2:1)*geom.maxVertex; + auto size = geom.vertexStride*(geom.vertexData[1] ? 
2:1)*(geom.maxVertex+1); uint16_t alignment = hlsl::max(0x1u< CAssetConverter::convert_impl(SReserveResul allocSizes.push_back(size); alignments.push_back(alignment); const auto tmp = asToBuild.second.scratchSize; - logger.log("%p Triangle Data Size %d Align %d Scratch Size %d",system::ILogger::ELL_DEBUG,canonical.get(),size,alignment,tmp); + //logger.log("%p Triangle Data Size %d Align %d Scratch Size %d",system::ILogger::ELL_DEBUG,canonical.get(),size,alignment,tmp); } } } @@ -4884,7 +4888,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // allocate out scratch or submit overflow, if fail then flush and keep trying till space is made auto* offsets = allocOffsets.data()+allocOffsets.size()-alignments.size(); const auto* sizes = allocSizes.data()+allocSizes.size()-alignments.size(); - logger.log("%p Combined Size %d",system::ILogger::ELL_DEBUG,canonical.get(),std::accumulate(sizes,sizes+alignments.size(),0)); + //logger.log("%p Combined Size %d",system::ILogger::ELL_DEBUG,canonical.get(),std::accumulate(sizes,sizes+alignments.size(),0)); for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(alignments.size(),offsets,sizes,alignments.data())!=0; t++) { if (t==1) // don't flush right away cause allocator not defragmented yet @@ -5042,8 +5046,10 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul if (const auto triCount=*(pPrimitiveCounts++); triCount) { auto& outGeom = triangles.emplace_back(); - auto offset = *(offsetIt++); - auto size = geom.vertexStride*geom.maxVertex; + const auto origSize = *(sizeIt++); + const auto origOffset = *(offsetIt++); + auto offset = origOffset; + auto size = geom.vertexStride*(geom.maxVertex+1); for (auto i=0; i<2; i++) if (geom.vertexData[i]) // could assert that it must be true for i==0 { @@ -5073,11 +5079,13 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul size = triCount*3*alignment; memcpyCallback.data = 
reinterpret_cast(geom.indexData.buffer->getPointer())+geom.indexData.offset; success = streamDataToScratch(offset,size,memcpyCallback); + offset += size; break; } default: break; } + assert(offset-origOffset<=origSize); if (!success) break; outGeom.maxVertex = geom.maxVertex; @@ -5091,8 +5099,6 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul success = pPrimitiveCounts==primitiveCounts.data()+primitiveCounts.size(); rangeInfos.push_back(reinterpret_cast(geometryRangeInfoOffset)); } - // current recording buffer may have changed - xferCmdBuf = params.transfer->getCommandBufferForRecording(); if (!success) { rangeInfos.resize(buildInfos.size()); @@ -5161,7 +5167,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul ) { // clean AS builds, pipeline barrier, query reset and writes need to get executed before we start waiting on the results - drainCompute(); + drainBoth(); // get queries core::vector sizes(compactions.size()); if (!device->getQueryPoolResults(queryPool.get(),0,compactions.size(),sizes.data(),sizeof(size_t),bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT)|IQueryPool::RESULTS_FLAGS::_64_BIT)) @@ -5301,7 +5307,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul }; // submit because we want to launch BLAS builds in a separate submit, so the scratch semaphore can signal and free the scratch and more is available for TLAS builds if (pipelineBarrier(computeCmdBuf,{.memBarriers={&readBLASInTLASBuildBarrier,1}},"Failed to sync BLAS with TLAS build!")) - drainCompute(); + drainBoth(); else failedBLASBarrier = true; } From 11813217a82561331b5f53ad20b21a66e9ce9506 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 21 May 2025 16:14:16 +0200 Subject: [PATCH 160/346] make BLAS tracking actually work --- include/nbl/video/IGPUAccelerationStructure.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/IGPUAccelerationStructure.h b/include/nbl/video/IGPUAccelerationStructure.h index 
b7c1858130..32ad54159a 100644 --- a/include/nbl/video/IGPUAccelerationStructure.h +++ b/include/nbl/video/IGPUAccelerationStructure.h @@ -672,7 +672,7 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr // this gets called when execution is sure to happen 100%, e.g. not during command recording but during submission inline build_ver_t registerNextBuildVer() { - return m_pendingBuildVer++; + return ++m_pendingBuildVer; } // using blas_smart_ptr_t = core::smart_refctd_ptr; From a639145bb2071855f83b4f2139c3a08203f09353 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 22 May 2025 11:56:29 +0700 Subject: [PATCH 161/346] fixes to 3-level scan and minor stuff --- .../hlsl/workgroup2/arithmetic_config.hlsl | 7 +-- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 62 ++++++++++++------- 2 files changed, 42 insertions(+), 27 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 5263a3fec8..04cbcaef4d 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -61,8 +61,8 @@ struct ArithmeticConfiguration NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; // must have at least enough level 0 outputs to feed a single subgroup - NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupsPerVirtualWorkgroupLog2 = mpl::max_v; - NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupsPerVirtualWorkgroup = uint16_t(0x1u) << SubgroupsPerVirtualWorkgroupLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t _SubgroupsPerVirtualWorkgroupLog2 = mpl::max_v; + NBL_CONSTEXPR_STATIC_INLINE uint16_t _SubgroupsPerVirtualWorkgroup = uint16_t(0x1u) << _SubgroupsPerVirtualWorkgroupLog2; using virtual_wg_t = impl::virtual_wg_size_log2; NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = virtual_wg_t::levels; @@ -83,7 +83,7 @@ struct ArithmeticConfiguration static uint32_t 
sharedMemCoalescedIndex(const uint32_t id, const uint32_t itemsPerInvocation) { - return (id & (itemsPerInvocation-1)) * SubgroupsPerVirtualWorkgroup + (id/itemsPerInvocation); + return (id & (itemsPerInvocation-1)) * SubgroupSize + (id/itemsPerInvocation); } }; @@ -96,7 +96,6 @@ struct is_configuration > : bool_constant { template NBL_CONSTEXPR bool is_configuration_v = is_configuration::value; - } } } diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index eca7ababd2..d44271a260 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -120,7 +120,7 @@ struct reduce vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); + scratchAccessor.template get(i*Config::SubgroupSize+invocationIndex,lv1_val[i]); lv1_val = reduction1(lv1_val); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) @@ -176,12 +176,12 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(i*Config::SubgroupsPerVirtualWorkgroup+prevIndex,lv1_val[i]); + scratchAccessor.template get(i*Config::SubgroupSize+prevIndex,lv1_val[i]); lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); lv1_val = inclusiveScan1(lv1_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template set(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); + scratchAccessor.template set(i*Config::SubgroupSize+invocationIndex,lv1_val[i]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -258,7 +258,7 @@ struct reduce vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template 
get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); + scratchAccessor.template get(i*Config::SubgroupSize+invocationIndex,lv1_val[i]); lv1_val = reduction1(lv1_val); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { @@ -275,7 +275,7 @@ struct reduce vector_lv2_t lv2_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv2_val[i]); + scratchAccessor.template get(i*Config::SubgroupSize+invocationIndex,lv2_val[i]); lv2_val = reduction2(lv2_val); scratchAccessor.template set(invocationIndex, lv2_val[Config::ItemsPerInvocation_2-1]); } @@ -324,15 +324,20 @@ struct scan scratchAccessor.workgroupExecutionAndMemoryBarrier(); // level 1 scan - const uint32_t lv1_smem_size = Config::SubgroupsPerVirtualWorkgroup*Config::ItemsPerInvocation_1; + const uint32_t lv1_smem_size = Config::SubgroupsSize*Config::ItemsPerInvocation_1; subgroup2::inclusive_scan inclusiveScan1; if (glsl::gl_SubgroupID() < lv1_smem_size) { vector_lv1_t lv1_val; + const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); + scratchAccessor.template get(i*Config::SubgroupSize+prevIndex,lv1_val[i]); + lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); lv1_val = inclusiveScan1(lv1_val); + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template set(i*Config::SubgroupSize+invocationIndex,lv1_val[i]); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); @@ -351,21 +356,30 @@ struct scan for 
(uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) scratchAccessor.template get(lv1_smem_size+i*Config::SubgroupSize+prevIndex,lv2_val[i]); lv2_val[0] = hlsl::mix(hlsl::promote(BinOp::identity), lv2_val[0], bool(invocationIndex)); - vector_lv2_t shiftedScan = inclusiveScan2(lv2_val); - - // combine with level 1, only last element of each + lv2_val = inclusiveScan2(lv2_val); [unroll] - for (uint32_t i = 0; i < Config::SubgroupsPerVirtualWorkgroup; i++) - { - scalar_t last_val; - scratchAccessor.template get((Config::ItemsPerInvocation_1-1)*Config::SubgroupsPerVirtualWorkgroup+(Config::SubgroupsPerVirtualWorkgroup-1-i),last_val); - scalar_t val = hlsl::mix(hlsl::promote(BinOp::identity), lv2_val, bool(i)); - val = binop(last_val, shiftedScan[Config::ItemsPerInvocation_2-1]); - scratchAccessor.template set((Config::ItemsPerInvocation_1-1)*Config::SubgroupsPerVirtualWorkgroup+(Config::SubgroupsPerVirtualWorkgroup-1-i), last_val); - } + for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) + scratchAccessor.template set(lv1_smem_size+i*Config::SubgroupSize+invocationIndex,lv2_val[i]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); + // combine with level 1 + if (glsl::gl_SubgroupID() < lv1_smem_size) + { + vector_lv1_t lv1_val; + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(i*Config::SubgroupSize+invocationIndex,lv1_val[i]); + + scalar_t lv2_scan; + const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); + scratchAccessor.template set(lv1_smem_size+bankedIndex, lv2_scan); + + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template set(i*Config::SubgroupSize+invocationIndex, binop(lv1_val[i],lv2_scan)); + } + // combine with level 0 [unroll] for (uint32_t 
idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) @@ -373,15 +387,17 @@ struct scan vector_lv0_t value; dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); // idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - const scalar_t left; - scratchAccessor.template get(virtualSubgroupID, left); + const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); + const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); + scalar_t left; + scratchAccessor.template get(bankedIndex,left); if (Exclusive) { scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(value[Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) - value[Config::ItemsPerInvocation_0-i-1] = binop(left, hlsl::mix(value[Config::ItemsPerInvocation_0-i-2], left_last_elem, (Config::ItemsPerInvocation_0-i-1==0))); + for (uint32_t i = Config::ItemsPerInvocation_0-1; i > 0; i--) + value[i] = binop(left, value[i-1]); + value[0] = binop(left, left_last_elem); } else { From 7751359a78b5ba7dad595aa04515c4fce3042bf1 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 22 May 2025 15:14:50 +0700 Subject: [PATCH 162/346] some minor fixes --- examples_tests | 2 +- include/nbl/builtin/hlsl/subgroup2/ballot.hlsl | 2 ++ include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl | 5 +---- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/examples_tests b/examples_tests index 0ccd26fc93..13ae89f7d3 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 0ccd26fc93d22587219b12291f855929949cef74 +Subproject commit 13ae89f7d3fc666124486b5e18f13922995d3569 diff --git 
a/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl b/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl index 52ae6de2d9..3b511126b4 100644 --- a/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl +++ b/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl @@ -4,6 +4,8 @@ #ifndef _NBL_BUILTIN_HLSL_SUBGROUP2_BALLOT_INCLUDED_ #define _NBL_BUILTIN_HLSL_SUBGROUP2_BALLOT_INCLUDED_ +#include "nbl/builtin/hlsl/glsl_compat/subgroup_ballot.hlsl" + namespace nbl { namespace hlsl diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 04cbcaef4d..512641abb8 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -22,6 +22,7 @@ struct virtual_wg_size_log2 // static_assert(WorkgroupSizeLog2<=SubgroupSizeLog2+4, "WorkgroupSize cannot be larger than SubgroupSize*16"); NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2),uint16_t,conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2*2+2),uint16_t,3,2>::value,1>::value; NBL_CONSTEXPR_STATIC_INLINE uint16_t value = mpl::max_v+SubgroupSizeLog2; + // must have at least enough level 0 outputs to feed a single subgroup }; template @@ -60,10 +61,6 @@ struct ArithmeticConfiguration NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = _SubgroupSizeLog2; NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; - // must have at least enough level 0 outputs to feed a single subgroup - NBL_CONSTEXPR_STATIC_INLINE uint16_t _SubgroupsPerVirtualWorkgroupLog2 = mpl::max_v; - NBL_CONSTEXPR_STATIC_INLINE uint16_t _SubgroupsPerVirtualWorkgroup = uint16_t(0x1u) << _SubgroupsPerVirtualWorkgroupLog2; - using virtual_wg_t = impl::virtual_wg_size_log2; NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = virtual_wg_t::levels; NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualWorkgroupSize = uint16_t(0x1u) << virtual_wg_t::value; From 
9f43c02bab2d70e4f59ce7a2f50b9580e2583691 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 22 May 2025 15:28:25 +0700 Subject: [PATCH 163/346] Return Subgroup size to IPipelineBase --- include/nbl/asset/ICPUGraphicsPipeline.h | 28 ++++++++++++------------ include/nbl/asset/ICPUPipeline.h | 2 ++ include/nbl/asset/IComputePipeline.h | 18 +-------------- include/nbl/asset/IPipeline.h | 15 +++++++++++++ 4 files changed, 32 insertions(+), 31 deletions(-) diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index dcdcfb495e..4a7ee3b695 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -26,20 +26,6 @@ class ICPUGraphicsPipeline final : public ICPUPipeline(retval,core::dont_grab); } - inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final - { - auto* newPipeline = new ICPUGraphicsPipeline(layout.get()); - newPipeline->m_params = m_params; - newPipeline->m_renderpass = m_renderpass; - - for (auto specInfo_i = 0u; specInfo_i < m_specInfos.size(); specInfo_i++) - { - newPipeline->m_specInfos[specInfo_i] = m_specInfos[specInfo_i].clone(depth); - } - - return core::smart_refctd_ptr(newPipeline, core::dont_grab); - } - constexpr static inline auto AssetType = ET_GRAPHICS_PIPELINE; inline E_TYPE getAssetType() const override { return AssetType; } @@ -121,6 +107,20 @@ class ICPUGraphicsPipeline final : public ICPUPipeline clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final + { + auto* newPipeline = new ICPUGraphicsPipeline(layout.get()); + newPipeline->m_params = m_params; + newPipeline->m_renderpass = m_renderpass; + + for (auto specInfo_i = 0u; specInfo_i < m_specInfos.size(); specInfo_i++) + { + newPipeline->m_specInfos[specInfo_i] = m_specInfos[specInfo_i].clone(depth); + } + + return core::smart_refctd_ptr(newPipeline, core::dont_grab); + } }; } diff --git a/include/nbl/asset/ICPUPipeline.h 
b/include/nbl/asset/ICPUPipeline.h index 8fe7e38391..435aca5d40 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -70,6 +70,8 @@ class ICPUPipelineBase core::smart_refctd_ptr shader = nullptr; std::string entryPoint = ""; + IPipelineBase::SUBGROUP_SIZE requiredSubgroupSize = IPipelineBase::SUBGROUP_SIZE::UNKNOWN; //!< Default value of 8 means no requirement + // Container choice implicitly satisfies: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-constantID-04911 core::unordered_map entries; diff --git a/include/nbl/asset/IComputePipeline.h b/include/nbl/asset/IComputePipeline.h index 4f439d7100..9ccef877c3 100644 --- a/include/nbl/asset/IComputePipeline.h +++ b/include/nbl/asset/IComputePipeline.h @@ -9,26 +9,10 @@ namespace nbl::asset class IComputePipelineBase : public virtual core::IReferenceCounted { public: - // Nabla requires device's reported subgroup size to be between 4 and 128 - enum class SUBGROUP_SIZE : uint8_t - { - // No constraint but probably means `gl_SubgroupSize` is Dynamically Uniform - UNKNOWN = 0, - // Allows the Subgroup Uniform `gl_SubgroupSize` to be non-Dynamically Uniform and vary between Device's min and max - VARYING = 1, - // The rest we encode as log2(x) of the required value - REQUIRE_4 = 2, - REQUIRE_8 = 3, - REQUIRE_16 = 4, - REQUIRE_32 = 5, - REQUIRE_64 = 6, - REQUIRE_128 = 7 - }; struct SCachedCreationParams final { - SUBGROUP_SIZE requiredSubgroupSize : 3 = SUBGROUP_SIZE::UNKNOWN; //!< Default value of 8 means no requirement - uint8_t requireFullSubgroups : 1 = false; + uint8_t requireFullSubgroups = false; }; }; diff --git a/include/nbl/asset/IPipeline.h b/include/nbl/asset/IPipeline.h index eb64de0b0d..c458c34afe 100644 --- a/include/nbl/asset/IPipeline.h +++ b/include/nbl/asset/IPipeline.h @@ -105,6 +105,21 @@ class IPipelineBase }; using FLAGS = CreationFlags; + // Nabla requires device's reported subgroup size to 
be between 4 and 128 + enum class SUBGROUP_SIZE : uint8_t + { + // No constraint but probably means `gl_SubgroupSize` is Dynamically Uniform + UNKNOWN = 0, + // Allows the Subgroup Uniform `gl_SubgroupSize` to be non-Dynamically Uniform and vary between Device's min and max + VARYING = 1, + // The rest we encode as log2(x) of the required value + REQUIRE_4 = 2, + REQUIRE_8 = 3, + REQUIRE_16 = 4, + REQUIRE_32 = 5, + REQUIRE_64 = 6, + REQUIRE_128 = 7 + }; }; template From bae94c58e8c73a7b111d0edfa4017a8770803809 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 22 May 2025 15:29:56 +0700 Subject: [PATCH 164/346] Fix missing bracket for getLayout --- include/nbl/asset/ICPUPipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index 435aca5d40..c7fe9b49e0 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -125,7 +125,7 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase, public ICPUPipe core::smart_refctd_ptr layout; if (_depth > 0u) - layout = core::smart_refctd_ptr_static_cast(getLayout->clone(_depth-1u)); + layout = core::smart_refctd_ptr_static_cast(getLayout()->clone(_depth - 1u)); return clone_impl(std::move(layout), _depth); } From 0d8fe94aefe5d820c43d26d9a6235951f2969c6b Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 22 May 2025 15:30:17 +0700 Subject: [PATCH 165/346] Return Subgroup Size to every SShaderSpecInfo --- include/nbl/video/IGPUPipeline.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/nbl/video/IGPUPipeline.h b/include/nbl/video/IGPUPipeline.h index ff6d97f17b..f9a32786bf 100644 --- a/include/nbl/video/IGPUPipeline.h +++ b/include/nbl/video/IGPUPipeline.h @@ -91,6 +91,9 @@ class IGPUPipelineBase { const asset::IShader* shader = nullptr; std::string_view entryPoint = ""; + asset::IPipelineBase::SUBGROUP_SIZE requiredSubgroupSize = asset::IPipelineBase::SUBGROUP_SIZE::UNKNOWN; //!< Default value of 
8 means no requirement + + // Container choice implicitly satisfies: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-constantID-04911 const core::unordered_map* entries; From 4ed04c83eb5162747c7dce9514c9e445b4ffd941 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 22 May 2025 15:30:36 +0700 Subject: [PATCH 166/346] Fix stagePresence typo --- include/nbl/video/IGPUGraphicsPipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/IGPUGraphicsPipeline.h b/include/nbl/video/IGPUGraphicsPipeline.h index ae8924a1ab..c44ef5ceb1 100644 --- a/include/nbl/video/IGPUGraphicsPipeline.h +++ b/include/nbl/video/IGPUGraphicsPipeline.h @@ -53,7 +53,7 @@ class IGPUGraphicsPipeline : public IGPUPipeline Date: Thu, 22 May 2025 15:30:56 +0700 Subject: [PATCH 167/346] Move clone_impl to private --- include/nbl/asset/ICPURayTracingPipeline.h | 44 +++++++++++----------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h index 2b04a2f41b..ed2c5d2409 100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -31,27 +31,7 @@ class ICPURayTracingPipeline final : public ICPUPipeline(retval,core::dont_grab); } - inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final - { - auto newPipeline = new ICPURayTracingPipeline(layout.get()); - newPipeline->m_raygen = m_raygen.clone(depth); - - auto cloneSpecInfos = [depth](const core::vector& specInfos) -> core::vector { - core::vector results; - results.resize(specInfos.size()); - for (auto specInfo_i = 0u; specInfo_i < specInfos.size(); specInfo_i++) - results[specInfo_i] = specInfos[specInfo_i].clone(depth); - return results; - }; - newPipeline->m_misses = cloneSpecInfos(m_misses); - newPipeline->m_hitGroups.anyHits = 
cloneSpecInfos(m_hitGroups.anyHits); - newPipeline->m_hitGroups.closestHits = cloneSpecInfos(m_hitGroups.closestHits); - newPipeline->m_hitGroups.intersections = cloneSpecInfos(m_hitGroups.intersections); - newPipeline->m_callables = cloneSpecInfos(m_callables); - - newPipeline->m_params = m_params; - return core::smart_refctd_ptr(newPipeline); - } + constexpr static inline auto AssetType = ET_RAYTRACING_PIPELINE; inline E_TYPE getAssetType() const override { return AssetType; } @@ -118,6 +98,28 @@ class ICPURayTracingPipeline final : public ICPUPipelinem_callables) dependants.insert(callableInfo.shader.get()); return dependants; } + + inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final + { + auto newPipeline = new ICPURayTracingPipeline(layout.get()); + newPipeline->m_raygen = m_raygen.clone(depth); + + auto cloneSpecInfos = [depth](const core::vector& specInfos) -> core::vector { + core::vector results; + results.resize(specInfos.size()); + for (auto specInfo_i = 0u; specInfo_i < specInfos.size(); specInfo_i++) + results[specInfo_i] = specInfos[specInfo_i].clone(depth); + return results; + }; + newPipeline->m_misses = cloneSpecInfos(m_misses); + newPipeline->m_hitGroups.anyHits = cloneSpecInfos(m_hitGroups.anyHits); + newPipeline->m_hitGroups.closestHits = cloneSpecInfos(m_hitGroups.closestHits); + newPipeline->m_hitGroups.intersections = cloneSpecInfos(m_hitGroups.intersections); + newPipeline->m_callables = cloneSpecInfos(m_callables); + + newPipeline->m_params = m_params; + return core::smart_refctd_ptr(newPipeline); + } }; } From c01392c93f7f7490131b7bc9bb4aa56b2140ba34 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 22 May 2025 15:31:16 +0700 Subject: [PATCH 168/346] Implement getSpecInfoVec for ICPURayTracingPipeline --- include/nbl/asset/ICPURayTracingPipeline.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/include/nbl/asset/ICPURayTracingPipeline.h 
b/include/nbl/asset/ICPURayTracingPipeline.h index ed2c5d2409..5819099887 100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -65,6 +65,28 @@ class ICPURayTracingPipeline final : public ICPUPipeline& getSpecInfoVec(hlsl::ShadeStage stage) + { + if (!isMutable()) return {}; + switch (stage) + { + // raygen is not stored as vector so we can't return it here. Use getSpecInfo + case hlsl::ShaderStage::ESS_MISS: + return m_misses; + case hlsl::ShaderStage::ESS_ANY_HIT: + return m_hitGroups.anyHits; + case hlsl::ShaderStage::ESS_CLOSEST_HIT: + return m_hitGroups.closestHits; + case hlsl::ShaderStage::ESS_INTERSECTION: + return m_hitGroups.intersections; + case hlsl::ShaderStage::ESS_CALLABLE: + return m_callables; + + } + return {}; + } + + inline virtual bool valid() const override final { // TODO(kevinyu): Fix this temporary dummy code From fd6f527f55b6cea8f4912642c92cb9fc572aa41a Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 22 May 2025 17:03:32 +0700 Subject: [PATCH 169/346] latest example --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 13ae89f7d3..a8774db88d 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 13ae89f7d3fc666124486b5e18f13922995d3569 +Subproject commit a8774db88d1d08d0a3fe9f2a30e7dc376120493a From 9a3cc695fbcb7508c0266fe7798ced0b18f9e9ed Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 22 May 2025 12:28:28 +0200 Subject: [PATCH 170/346] default AS patch constructor default values so patches merge correctly --- include/nbl/video/utilities/CAssetConverter.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index 182b025ada..682b3887a0 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -175,7 +175,7 @@ class 
CAssetConverter : public core::IReferenceCounted //! select build flags uint8_t allowUpdate : 1 = false; uint8_t allowCompaction : 1 = false; - BuildPreference preference : 2 = BuildPreference::Invalid; + BuildPreference preference : 2 = BuildPreference::None; uint8_t lowMemory : 1 = false; //! things that control the build uint8_t hostBuild : 1 = false; // DO NOT USE, will get overriden to false anyway @@ -187,7 +187,7 @@ class CAssetConverter : public core::IReferenceCounted template std::pair combine_impl(const CRTP& _this, const CRTP& other) const { - if (_this.preference!=other.preference || _this.preference==BuildPreference::Invalid) + if (_this.preference!=other.preference && _this.preference!=BuildPreference::None && other.preference!=BuildPreference::None) return {false,_this}; CRTP retval = _this; retval.isMotion |= other.isMotion; From 7b3c0edd4c40380caec7735f3f903483c156bfed Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 22 May 2025 18:35:51 +0700 Subject: [PATCH 171/346] Fix getSpecInfoVec --- include/nbl/asset/ICPURayTracingPipeline.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h index 5819099887..8be23ffe64 100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -65,25 +65,25 @@ class ICPURayTracingPipeline final : public ICPUPipeline& getSpecInfoVec(hlsl::ShadeStage stage) + inline core::vector* getSpecInfoVec(hlsl::ShaderStage stage) { - if (!isMutable()) return {}; + if (!isMutable()) return nullptr; switch (stage) { // raygen is not stored as vector so we can't return it here. 
Use getSpecInfo case hlsl::ShaderStage::ESS_MISS: - return m_misses; + return &m_misses; case hlsl::ShaderStage::ESS_ANY_HIT: - return m_hitGroups.anyHits; + return &m_hitGroups.anyHits; case hlsl::ShaderStage::ESS_CLOSEST_HIT: - return m_hitGroups.closestHits; + return &m_hitGroups.closestHits; case hlsl::ShaderStage::ESS_INTERSECTION: - return m_hitGroups.intersections; + return &m_hitGroups.intersections; case hlsl::ShaderStage::ESS_CALLABLE: - return m_callables; + return &m_callables; } - return {}; + return nullptr; } From 96db32b8bcc55dd9dfc49d1ec9117fec4f329fdd Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 22 May 2025 18:36:09 +0700 Subject: [PATCH 172/346] Implement ICPURayTracingPipeline valid --- include/nbl/asset/ICPURayTracingPipeline.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h index 8be23ffe64..618c851883 100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -89,7 +89,9 @@ class ICPURayTracingPipeline final : public ICPUPipelinevalid()) return false; + if (m_raygen.valid() == SShaderSpecInfo::INVALID_SPEC_INFO) return false; return true; } From 02c0d94b54e2ed0df8597e6157e5a73e54cbf94d Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 22 May 2025 14:17:28 +0200 Subject: [PATCH 173/346] forgot to overwrite staging cache XD --- src/nbl/video/utilities/CAssetConverter.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index b357e2e2bb..c69d373656 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -5276,6 +5276,9 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul auto& resultOutput = std::get>(reservations.m_gpuObjects); resultOutput[foundIx->second].value = compactedAS; } + // overwrite staging cache + auto pFound = 
findInStaging.template operator()(srcAS); + pFound->second.gpuRef = compactedAS; // insert into compaction map retval[srcAS] = std::move(compactedAS); } From 98f3153a21c755b30b0a9c89c28734ff1216426f Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 23 May 2025 13:11:55 +0700 Subject: [PATCH 174/346] Fix ICPUSkeleton.h computeDependants --- include/nbl/asset/ICPUSkeleton.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/nbl/asset/ICPUSkeleton.h b/include/nbl/asset/ICPUSkeleton.h index 51be7acc5a..a29adbabbc 100644 --- a/include/nbl/asset/ICPUSkeleton.h +++ b/include/nbl/asset/ICPUSkeleton.h @@ -94,9 +94,7 @@ class ICPUSkeleton final : public ISkeleton, public IAsset requires(std::same_as, ICPUSkeleton>) static auto computeDependantsImpl(Self* self) { using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; - core::unordered_set dependants; - return { self->m_defaultTransforms.buffer.get(), self->m_parentJointIDs.buffer.get() }; - return dependants; + return core::unordered_set{ self->m_defaultTransforms.buffer.get(), self->m_parentJointIDs.buffer.get() }; } }; From 30f35af1f9fdd14a48994862e3214c08d8c38710 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 23 May 2025 13:12:17 +0700 Subject: [PATCH 175/346] Small fixes --- include/nbl/asset/ICPUAccelerationStructure.h | 2 +- include/nbl/asset/ICPUDescriptorSetLayout.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index 3ac794a888..73365cbfce 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -272,7 +272,7 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA inline core::unordered_set computeDependants() const override { core::unordered_set dependants; - for (const auto& instance : m_instances) + for (const auto& instance : *m_instances) 
dependants.insert(instance.getBase().blas.get()); return dependants; } diff --git a/include/nbl/asset/ICPUDescriptorSetLayout.h b/include/nbl/asset/ICPUDescriptorSetLayout.h index b2c06792d6..aea1520b6f 100644 --- a/include/nbl/asset/ICPUDescriptorSetLayout.h +++ b/include/nbl/asset/ICPUDescriptorSetLayout.h @@ -78,7 +78,7 @@ class ICPUDescriptorSetLayout : public IDescriptorSetLayout, public using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; core::unordered_set dependants; if (!self->m_immutableSamplers) return dependants; - for (const auto& sampler: self->m_immutableSamplers) + for (const auto& sampler: *self->m_immutableSamplers) { dependants.insert(sampler.get()); } From 2983ff09b649e586867ebc417869f4422bd9a764 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 23 May 2025 13:12:32 +0700 Subject: [PATCH 176/346] Remove redundant final specifier --- include/nbl/asset/ICPUComputePipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index f6b689857f..27d16461a2 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -39,7 +39,7 @@ class ICPUComputePipeline final : public ICPUPipeline getSpecInfo(hlsl::ShaderStage stage) const override final + inline std::span getSpecInfo(hlsl::ShaderStage stage) const override { if (stage==hlsl::ShaderStage::ESS_COMPUTE) return {&m_specInfo,1}; From e218e7770e0c92c543f1ea017cd1204ac0375002 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 23 May 2025 13:12:55 +0700 Subject: [PATCH 177/346] Remove const so it can be cast to IAsset* --- include/nbl/asset/IPipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/asset/IPipeline.h b/include/nbl/asset/IPipeline.h index c458c34afe..d2a85c42fb 100644 --- a/include/nbl/asset/IPipeline.h +++ b/include/nbl/asset/IPipeline.h @@ -133,7 +133,7 @@ class IPipeline : public IPipelineBase inline 
IPipeline(core::smart_refctd_ptr&& _layout) : m_layout(std::move(_layout)) {} - core::smart_refctd_ptr m_layout; + core::smart_refctd_ptr m_layout; }; } From b58e486d505f9dd2f030322b16a34e029e2964c1 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 23 May 2025 13:13:15 +0700 Subject: [PATCH 178/346] Fix RenderpassIndependentPipeline --- include/nbl/asset/ICPURenderpassIndependentPipeline.h | 6 ++++++ include/nbl/asset/IRenderpassIndependentPipeline.h | 5 ----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/nbl/asset/ICPURenderpassIndependentPipeline.h b/include/nbl/asset/ICPURenderpassIndependentPipeline.h index 628785d2ab..fbff6ee312 100644 --- a/include/nbl/asset/ICPURenderpassIndependentPipeline.h +++ b/include/nbl/asset/ICPURenderpassIndependentPipeline.h @@ -19,6 +19,12 @@ namespace nbl::asset class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, public IAsset { public: + struct SCreationParams + { + std::span shaders = {}; + SCachedCreationParams cached = {}; + }; + //(TODO) it is true however it causes DSs to not be cached when ECF_DONT_CACHE_TOP_LEVEL is set which isnt really intuitive constexpr static inline uint32_t DESC_SET_HIERARCHYLEVELS_BELOW = 0u; // TODO: @Crisspl HOW ON EARTH DOES THIS MAKE SENSE!? 
diff --git a/include/nbl/asset/IRenderpassIndependentPipeline.h b/include/nbl/asset/IRenderpassIndependentPipeline.h index 7f33b6abc4..feeaff7c99 100644 --- a/include/nbl/asset/IRenderpassIndependentPipeline.h +++ b/include/nbl/asset/IRenderpassIndependentPipeline.h @@ -28,11 +28,6 @@ class IRenderpassIndependentPipeline SRasterizationParams rasterization = {}; SBlendParams blend = {}; }; - struct SCreationParams - { - std::span shaders = {}; - SCachedCreationParams cached = {}; - }; inline const SCachedCreationParams& getCachedCreationParams() const {return m_cachedParams;} From 1f3a4775530484bca85ddf8dc46b5e0bc0c46aa1 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 23 May 2025 13:15:23 +0700 Subject: [PATCH 179/346] Fix SpirvIntrospector --- include/nbl/asset/ICPUPipeline.h | 8 +++++--- include/nbl/asset/utils/CSPIRVIntrospector.h | 4 ++-- src/nbl/asset/utils/CSPIRVIntrospector.cpp | 20 ++++++++++---------- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index c7fe9b49e0..9674b872e0 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -72,9 +72,10 @@ class ICPUPipelineBase IPipelineBase::SUBGROUP_SIZE requiredSubgroupSize = IPipelineBase::SUBGROUP_SIZE::UNKNOWN; //!< Default value of 8 means no requirement + using spec_constant_map_t = core::unordered_map; // Container choice implicitly satisfies: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-constantID-04911 - core::unordered_map entries; + spec_constant_map_t entries; // By requiring Nabla Core Profile features we implicitly satisfy: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02784 // 
https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02785 @@ -92,7 +93,7 @@ class ICPUPipelineBase } }; - virtual std::span getSpecInfo(const hlsl::ShaderStage stage) const = 0; + virtual std::span getSpecInfo(hlsl::ShaderStage stage) const = 0; }; @@ -130,7 +131,8 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase, public ICPUPipe return clone_impl(std::move(layout), _depth); } - inline std::span getSpecInfo(hlsl::ShaderStage stage) + // Note(kevinyu): For some reason overload resolution cannot find this function when I name id getSpecInfo. It always use the const variant. Will check on it later. + inline std::span getSpecInfoMut(hlsl::ShaderStage stage) { if (!isMutable()) return {}; const auto specInfo = const_cast(this)->getSpecInfo(stage); diff --git a/include/nbl/asset/utils/CSPIRVIntrospector.h b/include/nbl/asset/utils/CSPIRVIntrospector.h index 3d6455e020..fa497f08aa 100644 --- a/include/nbl/asset/utils/CSPIRVIntrospector.h +++ b/include/nbl/asset/utils/CSPIRVIntrospector.h @@ -582,7 +582,7 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable } // returns true if successfully added all the info to self, false if incompatible with what's already in our pipeline or incomplete (e.g. missing spec constants) - bool merge(const CStageIntrospectionData* stageData, const IPipelineBase::SShaderSpecInfo::spec_constant_map_t* specConstants=nullptr); + bool merge(const CStageIntrospectionData* stageData, const ICPUPipelineBase::SShaderSpecInfo::spec_constant_map_t* specConstants=nullptr); // core::smart_refctd_dynamic_array createPushConstantRangesFromIntrospection(core::smart_refctd_ptr& introspection); @@ -643,7 +643,7 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable } //! 
creates pipeline for a single IShader - core::smart_refctd_ptr createApproximateComputePipelineFromIntrospection(const IPipelineBase::SShaderSpecInfo& info, core::smart_refctd_ptr&& layout=nullptr); + core::smart_refctd_ptr createApproximateComputePipelineFromIntrospection(const ICPUPipelineBase::SShaderSpecInfo& info, core::smart_refctd_ptr&& layout=nullptr); #if 0 // wait until Renderpass Indep completely gone and Graphics Pipeline is used in a new way && Graphics Pipeline Libraries struct CShaderStages diff --git a/src/nbl/asset/utils/CSPIRVIntrospector.cpp b/src/nbl/asset/utils/CSPIRVIntrospector.cpp index 8b43c676b7..214ffdddbb 100644 --- a/src/nbl/asset/utils/CSPIRVIntrospector.cpp +++ b/src/nbl/asset/utils/CSPIRVIntrospector.cpp @@ -3,6 +3,8 @@ // For conditions of distribution and use, see copyright notice in nabla.h #include "nbl/asset/utils/CSPIRVIntrospector.h" + +#include "nbl/asset/ICPUPipeline.h" #include "nbl/asset/utils/spvUtils.h" #include "nbl_spirv_cross/spirv_parser.hpp" @@ -106,15 +108,15 @@ static CSPIRVIntrospector::CStageIntrospectionData::VAR_TYPE spvcrossType2E_TYPE } } -core::smart_refctd_ptr CSPIRVIntrospector::createApproximateComputePipelineFromIntrospection(const IPipelineBase::SShaderSpecInfo& info, core::smart_refctd_ptr&& layout/* = nullptr*/) +core::smart_refctd_ptr CSPIRVIntrospector::createApproximateComputePipelineFromIntrospection(const ICPUPipelineBase::SShaderSpecInfo& info, core::smart_refctd_ptr&& layout/* = nullptr*/) { - if (info.stage!=IShader::E_SHADER_STAGE::ESS_COMPUTE || info.valid()==IPipelineBase::SShaderSpecInfo::INVALID_SPEC_INFO) + if (info.valid()==ICPUPipelineBase::SShaderSpecInfo::INVALID_SPEC_INFO) return nullptr; CStageIntrospectionData::SParams params; params.entryPoint = info.entryPoint; params.shader = core::smart_refctd_ptr(info.shader); - params.stage = info.stage; + params.stage = hlsl::ShaderStage::ESS_COMPUTE; auto introspection = introspect(params); @@ -174,15 +176,13 @@ core::smart_refctd_ptr 
CSPIRVIntrospector::createApproximat layout = pplnIntrospectData->createApproximatePipelineLayoutFromIntrospection(introspection); } - ICPUComputePipeline::SCreationParams pplnCreationParams; - pplnCreationParams.layout = layout.get(); - pplnCreationParams.shader = info; - pplnCreationParams.layout = layout.get(); - return ICPUComputePipeline::create(pplnCreationParams); + auto pipeline = ICPUComputePipeline::create(layout.get()); + pipeline->getSpecInfoMut(hlsl::ShaderStage::ESS_COMPUTE)[0] = info; + return pipeline; } // returns true if successfully added all the info to self, false if incompatible with what's already in our pipeline or incomplete (e.g. missing spec constants) -NBL_API2 bool CSPIRVIntrospector::CPipelineIntrospectionData::merge(const CSPIRVIntrospector::CStageIntrospectionData* stageData, const IPipelineBase::SShaderSpecInfo::spec_constant_map_t* specConstants) +NBL_API2 bool CSPIRVIntrospector::CPipelineIntrospectionData::merge(const CSPIRVIntrospector::CStageIntrospectionData* stageData, const ICPUPipelineBase::SShaderSpecInfo::spec_constant_map_t* specConstants) { if (!stageData) return false; @@ -218,7 +218,7 @@ NBL_API2 bool CSPIRVIntrospector::CPipelineIntrospectionData::merge(const CSPIRV if (specConstantFound == specConstants->end()) return false; - descInfo.count = specConstantFound->second; + descInfo.count = (specConstantFound->second.size() != 0); } else { From 5b6e20e8f27af143870735f30ddd82068c2a8503 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 23 May 2025 10:55:07 +0200 Subject: [PATCH 180/346] keep a pending TLAS build BLAS tracking set linked list Make the Acceleration Structure Copy Structs strongly typed --- include/nbl/video/IGPUAccelerationStructure.h | 181 +++++++++++------- include/nbl/video/IGPUCommandBuffer.h | 51 ++++- include/nbl/video/ILogicalDevice.h | 30 ++- include/nbl/video/IQueue.h | 7 +- src/nbl/video/CVulkanAccelerationStructure.h | 33 ---- src/nbl/video/CVulkanCommandBuffer.cpp | 21 +- 
src/nbl/video/CVulkanCommandBuffer.h | 6 +- src/nbl/video/CVulkanLogicalDevice.cpp | 21 +- src/nbl/video/CVulkanLogicalDevice.h | 6 +- src/nbl/video/IGPUCommandBuffer.cpp | 49 +++-- src/nbl/video/IQueue.cpp | 32 ++-- src/nbl/video/utilities/CAssetConverter.cpp | 2 +- 12 files changed, 268 insertions(+), 171 deletions(-) diff --git a/include/nbl/video/IGPUAccelerationStructure.h b/include/nbl/video/IGPUAccelerationStructure.h index 32ad54159a..68b4c1940b 100644 --- a/include/nbl/video/IGPUAccelerationStructure.h +++ b/include/nbl/video/IGPUAccelerationStructure.h @@ -98,39 +98,6 @@ class IGPUAccelerationStructure : public IBackendObject } }; - // copies - enum class COPY_MODE : uint8_t - { - CLONE = 0, - COMPACT = 1, - SERIALIZE = 2, - DESERIALIZE = 3, - }; - struct CopyInfo - { - const IGPUAccelerationStructure* src = nullptr; - IGPUAccelerationStructure* dst = nullptr; - COPY_MODE mode = COPY_MODE::CLONE; - }; - template requires (!std::is_const_v && std::is_base_of_v) - struct CopyToMemoryInfo - { - const IGPUAccelerationStructure* src = nullptr; - asset::SBufferBinding dst = nullptr; - COPY_MODE mode = COPY_MODE::SERIALIZE; - }; - using DeviceCopyToMemoryInfo = CopyToMemoryInfo; - using HostCopyToMemoryInfo = CopyToMemoryInfo; - template requires (!std::is_const_v && std::is_base_of_v) - struct CopyFromMemoryInfo - { - asset::SBufferBinding src = nullptr; - IGPUAccelerationStructure* dst = nullptr; - COPY_MODE mode = COPY_MODE::DESERIALIZE; - }; - using DeviceCopyFromMemoryInfo = CopyFromMemoryInfo; - using HostCopyFromMemoryInfo = CopyFromMemoryInfo; - // this will return false also if your deferred operation is not ready yet, so please use in combination with `isPending()` virtual bool wasCopySuccessful(const IDeferredOperation* const deferredOp) = 0; @@ -176,6 +143,30 @@ class IGPUBottomLevelAccelerationStructure : public asset::IBottomLevelAccelerat inline bool usesMotion() const override {return m_params.flags.hasFlags(SCreationParams::FLAGS::MOTION_BIT);} 
+ // copies + struct CopyInfo + { + const IGPUBottomLevelAccelerationStructure* src = nullptr; + IGPUAccelerationStructure* dst = nullptr; + bool compact = false; + }; + template requires (!std::is_const_v && std::is_base_of_v) + struct CopyToMemoryInfo + { + const IGPUBottomLevelAccelerationStructure* src = nullptr; + asset::SBufferBinding dst = nullptr; + }; + using DeviceCopyToMemoryInfo = CopyToMemoryInfo; + using HostCopyToMemoryInfo = CopyToMemoryInfo; + template requires (!std::is_const_v && std::is_base_of_v) + struct CopyFromMemoryInfo + { + asset::SBufferBinding src = nullptr; + IGPUBottomLevelAccelerationStructure* dst = nullptr; + }; + using DeviceCopyFromMemoryInfo = CopyFromMemoryInfo; + using HostCopyFromMemoryInfo = CopyFromMemoryInfo; + // read the comments in the .hlsl file, AABB builds ignore certain fields using BuildRangeInfo = hlsl::acceleration_structures::bottom_level::BuildRangeInfo; // TODO: rename to GeometryRangeInfo, and make `BuildRangeInfo = const GeometryRangeInfo*` using DirectBuildRangeRangeInfos = const BuildRangeInfo* const*; @@ -388,6 +379,34 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr // inline uint32_t getMaxInstanceCount() const {return m_maxInstanceCount;} + // copies + struct CopyInfo + { + const IGPUTopLevelAccelerationStructure* src = nullptr; + IGPUTopLevelAccelerationStructure* dst = nullptr; + bool compact = false; + }; + template requires (!std::is_const_v && std::is_base_of_v) + struct CopyToMemoryInfo + { + const IGPUTopLevelAccelerationStructure* src = nullptr; + asset::SBufferBinding dst = nullptr; + // [optional] Query the tracked BLASes + core::smart_refctd_dynamic_array> trackedBLASes = nullptr; + }; + using DeviceCopyToMemoryInfo = CopyToMemoryInfo; + using HostCopyToMemoryInfo = CopyToMemoryInfo; + template requires (!std::is_const_v && std::is_base_of_v) + struct CopyFromMemoryInfo + { + asset::SBufferBinding src = nullptr; + IGPUTopLevelAccelerationStructure* dst = 
nullptr; + // [optional] Provide info about what BLAS references to hold onto after the copy. For performance make sure the list is compact (without repeated elements). + std::span trackedBLASes = {}; + }; + using DeviceCopyFromMemoryInfo = CopyFromMemoryInfo; + using HostCopyFromMemoryInfo = CopyFromMemoryInfo; + // read the comments in the .hlsl file using BuildRangeInfo = hlsl::acceleration_structures::top_level::BuildRangeInfo; using DirectBuildRangeRangeInfos = const BuildRangeInfo*; @@ -677,61 +696,87 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr // using blas_smart_ptr_t = core::smart_refctd_ptr; // returns number of tracked BLASes if `tracked==nullptr` otherwise writes `*count` tracked BLASes from `first` into `*tracked` - inline build_ver_t getTrackedBLASes(uint32_t* count, blas_smart_ptr_t* tracked, const uint32_t first=0) const + inline void getPendingBuildTrackedBLASes(uint32_t* count, blas_smart_ptr_t* tracked, const build_ver_t buildVer) const { if (!count) - return 0; + return; // stop multiple threads messing with us std::lock_guard lk(m_trackingLock); - const uint32_t toWrite = std::min(std::max(m_trackedBLASes.size(),first)-first,tracked ? 
(*count):0xffFFffFFu); - *count = toWrite; - if (tracked && toWrite) - { - auto it = m_trackedBLASes.begin(); - // cmon its an unordered map, iterator should have operator += - for (auto i=0; isize():0; + if (!tracked || !pBLASes) + return; + for (auto it=pBLASes->begin(); it!=pBLASes->end(); it++) + *(tracked++) = *(it++); } - // Useful if TLAS got built externally as well, returns if there were no later builds that preempted us setting the result here + // Useful if TLAS got built externally as well template - inline bool setTrackedBLASes(const Iterator begin, const Iterator end, const build_ver_t buildVer) + inline void insertTrackedBLASes(const Iterator begin, const Iterator end, const build_ver_t buildVer) { + if (buildVer==0) + return; // stop multiple threads messing with us std::lock_guard lk(m_trackingLock); - // stop out of order callbacks - if (buildVer<=m_completedBuildVer) - return false; - m_completedBuildVer = buildVer; - // release already tracked BLASes - m_trackedBLASes.clear(); - // sanity check, TODO: this should be an atomic_max on the `m_pendingBuildVer` - if (m_completedBuildVer>m_pendingBuildVer) - m_pendingBuildVer = m_completedBuildVer; + // insert in the right order + auto prev = m_pendingBuilds.before_begin(); + for (auto it=std::next(prev); it!=m_pendingBuilds.end()&&it->ordinal>buildVer; prev=it++) {} + auto inserted = m_pendingBuilds.emplace_after(prev); // now fill the contents - m_trackedBLASes.insert(begin,end); - return true; + inserted->BLASes.insert(begin,end); + inserted->ordinal = buildVer; + } + template + inline build_ver_t pushTrackedBLASes(const Iterator begin, const Iterator end) + { + const auto buildVer = registerNextBuildVer(); + insertTrackedBLASes(begin,end,buildVer); + return buildVer; } - // a little utility to make sure nothing from this build version and before gets tracked - inline bool clearTrackedBLASes(const build_ver_t buildVer) + // a little utility to make sure nothing from before this build version gets 
tracked + inline void clearTrackedBLASes(const build_ver_t buildVer) { - return setTrackedBLASes(nullptr,nullptr,buildVer); + // stop multiple threads messing with us + std::lock_guard lk(m_trackingLock); + clearTrackedBLASes_impl(buildVer); } protected: inline IGPUTopLevelAccelerationStructure(core::smart_refctd_ptr&& dev, SCreationParams&& params) : Base(), IGPUAccelerationStructure(std::move(dev),std::move(params)), - m_maxInstanceCount(params.maxInstanceCount),m_trackedBLASes() {} - + m_maxInstanceCount(params.maxInstanceCount) {} const uint32_t m_maxInstanceCount; + + private: + friend class IGPUCommandBuffer; + inline const core::unordered_set* getPendingBuildTrackedBLASes(const build_ver_t buildVer) const + { + const auto found = std::find_if(m_pendingBuilds.begin(),m_pendingBuilds.end(),[buildVer](const auto& item)->bool{return item.ordinal==buildVer;}); + if (found==m_pendingBuilds.end()) + return nullptr; + return &found->BLASes; + } + inline void clearTrackedBLASes_impl(const build_ver_t buildVer) + { + // find first element less or equal to `buildVer` + auto prev = m_pendingBuilds.before_begin(); + for (auto it=std::next(prev); it!=m_pendingBuilds.end()&&it->ordinal>=buildVer; prev=it++) {} + m_pendingBuilds.erase_after(prev,m_pendingBuilds.end()); + } + + std::atomic m_pendingBuildVer = 0; // TODO: maybe replace with new readers/writers lock mutable std::mutex m_trackingLock; - std::atomic m_pendingBuildVer = 0; - build_ver_t m_completedBuildVer = 0; - core::unordered_set m_trackedBLASes; + // TODO: this definitely needs improving with MultiEventTimelines (which also can track deferred Host ops) but then one needs to track semaphore signal-wait deps so we know what "state copy" a compaction wants + // Deferred Op must complete AFTER a submit, otherwise race condition. 
+ // If we make a linked list of pending builds, then we just need to pop completed builds (traverse until current found) + struct STrackingInfo + { + core::unordered_set BLASes; + // when the build got + build_ver_t ordinal; + }; + // a little misleading, the element is the most recently completed one + core::forward_list m_pendingBuilds; }; } diff --git a/include/nbl/video/IGPUCommandBuffer.h b/include/nbl/video/IGPUCommandBuffer.h index d5a3fac0af..98d98ab98a 100644 --- a/include/nbl/video/IGPUCommandBuffer.h +++ b/include/nbl/video/IGPUCommandBuffer.h @@ -321,9 +321,12 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject } //! acceleration structure transfers - bool copyAccelerationStructure(const IGPUAccelerationStructure::CopyInfo& copyInfo); - bool copyAccelerationStructureToMemory(const IGPUAccelerationStructure::DeviceCopyToMemoryInfo& copyInfo); - bool copyAccelerationStructureFromMemory(const IGPUAccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo); + template requires std::is_base_of_v + bool copyAccelerationStructure(const AccelerationStructure::CopyInfo& copyInfo); + template requires std::is_base_of_v + bool copyAccelerationStructureToMemory(const AccelerationStructure::DeviceCopyToMemoryInfo& copyInfo); + template requires std::is_base_of_v + bool copyAccelerationStructureFromMemory(const AccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo); //! 
state setup bool bindComputePipeline(const IGPUComputePipeline* const pipeline); @@ -549,7 +552,31 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject bool executeCommands(const uint32_t count, IGPUCommandBuffer* const* const cmdbufs); // in case you want the commandbuffer to hold onto things as long as its not RESET - bool recordReferences(const std::span refs); + template + inline bool recordReferences(Iterator begin, const Iterator end) + { + auto oit = reserveReferences(std::distance(begin,end)); + if (oit) + while (begin!=end) + *(oit++) = core::smart_refctd_ptr(*(begin++)); + return oit; + } + inline bool recordReferences(const std::span refs) {return recordReferences(refs.begin(),refs.end());} + + // in case you want the commandbuffer to overwrite the BLAS tracking, e.g. you recorded TLAS building commands directly using `getNativeHandle()` to get the commandbuffer + template + inline bool recordBLASReferenceOverwrite(IGPUTopLevelAccelerationStructure* tlas, Iterator beginBLASes, const Iterator endBLASes) + { + const auto size = std::distance(beginBLASes,endBLASes); + auto oit = reserveReferences(size); + if (oit) + { + m_TLASToBLASReferenceSets[tlas] = {oit,size}; + while (beginBLASes!=endBLASes) + *(oit++) = core::smart_refctd_ptr(*(beginBLASes++)); + } + return oit; + } virtual bool insertDebugMarker(const char* name, const core::vector4df_SIMD& color = core::vector4df_SIMD(1.0, 1.0, 1.0, 1.0)) = 0; virtual bool beginDebugMarker(const char* name, const core::vector4df_SIMD& color = core::vector4df_SIMD(1.0, 1.0, 1.0, 1.0)) = 0; @@ -640,9 +667,9 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject const uint64_t* const pIndirectOffsets, const uint32_t* const pIndirectStrides, const uint32_t* const pMaxInstanceCounts ) = 0; - virtual bool copyAccelerationStructure_impl(const IGPUAccelerationStructure::CopyInfo& copyInfo) = 0; - virtual bool copyAccelerationStructureToMemory_impl(const IGPUAccelerationStructure::DeviceCopyToMemoryInfo& 
copyInfo) = 0; - virtual bool copyAccelerationStructureFromMemory_impl(const IGPUAccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo) = 0; + virtual bool copyAccelerationStructure_impl(const IGPUAccelerationStructure* src, IGPUAccelerationStructure* dst, const bool compact) = 0; + virtual bool copyAccelerationStructureToMemory_impl(const IGPUAccelerationStructure* src, const asset::SBufferBinding& dst) = 0; + virtual bool copyAccelerationStructureFromMemory_impl(const asset::SBufferBinding& src, IGPUAccelerationStructure* dst) = 0; virtual bool bindComputePipeline_impl(const IGPUComputePipeline* const pipeline) = 0; virtual bool bindGraphicsPipeline_impl(const IGPUGraphicsPipeline* const pipeline) = 0; @@ -875,12 +902,13 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject template requires nbl::is_any_of_v bool invalidDrawIndirectCount(const asset::SBufferBinding& indirectBinding, const asset::SBufferBinding& countBinding, const uint32_t maxDrawCount, const uint32_t stride); + core::smart_refctd_ptr* reserveReferences(const uint32_t size); // This bound descriptor set record doesn't include the descriptor sets whose layout has _any_ one of its bindings // created with IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT // or IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_UNUSED_WHILE_PENDING_BIT. core::unordered_map m_boundDescriptorSetsRecord; - + // If the user wants the builds to be tracking, and make the TLAS remember the BLASes that have been built into it. // NOTE: We know that a TLAS may be rebuilt multiple times per frame on purpose and not only the final BLASes need to be kept alive till submission finishes. // However, the Command Pool already tracks resources referenced in the Build Infos, so we only need pointers into those records. 
@@ -905,6 +933,13 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject NBL_ENUM_ADD_BITWISE_OPERATORS(IGPUCommandBuffer::USAGE); #ifndef _NBL_VIDEO_I_GPU_COMMAND_BUFFER_CPP_ +extern template bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUBottomLevelAccelerationStructure::CopyInfo&); +extern template bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUTopLevelAccelerationStructure::CopyInfo&); +extern template bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUBottomLevelAccelerationStructure::DeviceCopyToMemoryInfo&); +extern template bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUTopLevelAccelerationStructure::DeviceCopyToMemoryInfo&); +extern template bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUBottomLevelAccelerationStructure::DeviceCopyFromMemoryInfo&); +extern template bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUTopLevelAccelerationStructure::DeviceCopyFromMemoryInfo&); + extern template uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( const std::span, IGPUBottomLevelAccelerationStructure::DirectBuildRangeRangeInfos, const IGPUBuffer* const ); diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index b23afa2679..0e36c9ace1 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -592,7 +592,20 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe { auto tlas = set.first; // we know the build is completed immediately after performing it, so we get our pending stamp then - tlas->setTrackedBLASes(set.second.begin(),set.second.end(),tlas->registerNextBuildVer()); + // ideally we should get our build version when the work of the deferred op gets executed for the first time + using iterator = decltype(set.second)::iterator; + struct CustomIterator + { + inline bool operator!=(const CustomIterator& other) const {return ptr!=other.ptr;} + + 
inline CustomIterator operator++() {return {ptr++};} + + inline const IGPUBottomLevelAccelerationStructure* operator*() const {return dynamic_cast(ptr->get());} + + iterator ptr; + }; + const auto buildVer = tlas->pushTrackedBLASes({set.second.begin()},{set.second.end()}); + tlas->clearTrackedBLASes(buildVer); } } @@ -657,7 +670,8 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe return writeAccelerationStructuresProperties_impl(accelerationStructures,type,data,stride); } // Host-side copy, DEFERRAL IS NOT OPTIONAL - inline bool copyAccelerationStructure(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::CopyInfo& copyInfo) + template requires std::is_base_of_v + inline bool copyAccelerationStructure(IDeferredOperation* const deferredOperation, const AccelerationStructure::CopyInfo& copyInfo) { if (!acquireDeferredOperation(deferredOperation)) { @@ -679,7 +693,8 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe return result!=DEFERRABLE_RESULT::SOME_ERROR; } - inline bool copyAccelerationStructureToMemory(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyToMemoryInfo& copyInfo) + template requires std::is_base_of_v + inline bool copyAccelerationStructureToMemory(IDeferredOperation* const deferredOperation, const AccelerationStructure::HostCopyToMemoryInfo& copyInfo) { if (!acquireDeferredOperation(deferredOperation)) { @@ -704,7 +719,8 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe }); return result!=DEFERRABLE_RESULT::SOME_ERROR; } - inline bool copyAccelerationStructureFromMemory(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyFromMemoryInfo& copyInfo) + template requires std::is_base_of_v + inline bool copyAccelerationStructureFromMemory(IDeferredOperation* const deferredOperation, const AccelerationStructure::HostCopyFromMemoryInfo& copyInfo) { if 
(!acquireDeferredOperation(deferredOperation)) { @@ -1122,9 +1138,9 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe const IGPUTopLevelAccelerationStructure::BuildRangeInfo* const pBuildRangeInfos, const uint32_t totalGeometryCount ) = 0; virtual bool writeAccelerationStructuresProperties_impl(const std::span accelerationStructures, const IQueryPool::TYPE type, size_t* data, const size_t stride) = 0; - virtual DEFERRABLE_RESULT copyAccelerationStructure_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::CopyInfo& copyInfo) = 0; - virtual DEFERRABLE_RESULT copyAccelerationStructureToMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyToMemoryInfo& copyInfo) = 0; - virtual DEFERRABLE_RESULT copyAccelerationStructureFromMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyFromMemoryInfo& copyInfo) = 0; + virtual DEFERRABLE_RESULT copyAccelerationStructure_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure* src, IGPUAccelerationStructure* dst, const bool compact) = 0; + virtual DEFERRABLE_RESULT copyAccelerationStructureToMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure* src, const asset::SBufferBinding& dst) = 0; + virtual DEFERRABLE_RESULT copyAccelerationStructureFromMemory_impl(IDeferredOperation* const deferredOperation, const asset::SBufferBinding& src, IGPUAccelerationStructure* dst) = 0; virtual core::smart_refctd_ptr createShader_impl(const asset::ICPUShader* spirvShader) = 0; diff --git a/include/nbl/video/IQueue.h b/include/nbl/video/IQueue.h index 28336b15cc..c52e30517f 100644 --- a/include/nbl/video/IQueue.h +++ b/include/nbl/video/IQueue.h @@ -125,12 +125,7 @@ class IQueue : public core::Interface, public core::Unmovable class DeferredSubmitCallback final { // - struct STLASBuildMetadata - { - core::unordered_set m_BLASes; - 
uint32_t m_buildVer; - }; - core::unordered_map m_TLASToBLASReferenceSets; + core::unordered_map m_TLASBuilds; // using smart_ptr = core::smart_refctd_ptr; core::smart_refctd_dynamic_array m_resources; diff --git a/src/nbl/video/CVulkanAccelerationStructure.h b/src/nbl/video/CVulkanAccelerationStructure.h index 8041927fa2..4c0d67eee1 100644 --- a/src/nbl/video/CVulkanAccelerationStructure.h +++ b/src/nbl/video/CVulkanAccelerationStructure.h @@ -54,21 +54,6 @@ class CVulkanTopLevelAccelerationStructure final : public CVulkanAccelerationStr using Base::Base; }; - -//! all these utilities cannot be nested because of the complex inheritance between `IGPUAccelerationStructure` and the Vulkan classes -inline VkCopyAccelerationStructureModeKHR getVkCopyAccelerationStructureModeFrom(const IGPUAccelerationStructure::COPY_MODE in) -{ - return static_cast(in); -} -inline VkCopyAccelerationStructureInfoKHR getVkCopyAccelerationStructureInfoFrom(const IGPUAccelerationStructure::CopyInfo& copyInfo) -{ - VkCopyAccelerationStructureInfoKHR info = { VK_STRUCTURE_TYPE_COPY_ACCELERATION_STRUCTURE_INFO_KHR,nullptr }; - info.src = *reinterpret_cast(copyInfo.src->getNativeHandle()); - info.dst = *reinterpret_cast(copyInfo.dst->getNativeHandle()); - info.mode = getVkCopyAccelerationStructureModeFrom(copyInfo.mode); - return info; -} - template concept Buffer = is_any_of_v,IGPUBuffer,asset::ICPUBuffer>; @@ -91,24 +76,6 @@ inline DeviceOrHostAddress getVkDeviceOrHostAddress(const asset::SBu } return addr; } -template -inline VkCopyAccelerationStructureToMemoryInfoKHR getVkCopyAccelerationStructureToMemoryInfoFrom(const IGPUAccelerationStructure::CopyToMemoryInfo& copyInfo) -{ - VkCopyAccelerationStructureToMemoryInfoKHR info = { VK_STRUCTURE_TYPE_COPY_ACCELERATION_STRUCTURE_TO_MEMORY_INFO_KHR,nullptr }; - info.src = *reinterpret_cast(copyInfo.src->getNativeHandle()); - info.dst = getVkDeviceOrHostAddress(copyInfo.dst); - info.mode = getVkCopyAccelerationStructureModeFrom(copyInfo.mode); - 
return info; -} -template -inline VkCopyMemoryToAccelerationStructureInfoKHR getVkCopyMemoryToAccelerationStructureInfoFrom(const IGPUAccelerationStructure::CopyFromMemoryInfo& copyInfo) -{ - VkCopyMemoryToAccelerationStructureInfoKHR info = { VK_STRUCTURE_TYPE_COPY_MEMORY_TO_ACCELERATION_STRUCTURE_INFO_KHR,nullptr }; - info.src = getVkDeviceOrHostAddress(copyInfo.src); - info.dst = *reinterpret_cast(copyInfo.dst->getNativeHandle()); - info.mode = getVkCopyAccelerationStructureModeFrom(copyInfo.mode); - return info; -} inline VkGeometryFlagsKHR getVkGeometryFlagsFrom(const IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS in) { diff --git a/src/nbl/video/CVulkanCommandBuffer.cpp b/src/nbl/video/CVulkanCommandBuffer.cpp index b569a5fde2..b53c3c1537 100644 --- a/src/nbl/video/CVulkanCommandBuffer.cpp +++ b/src/nbl/video/CVulkanCommandBuffer.cpp @@ -377,22 +377,31 @@ bool CVulkanCommandBuffer::copyImage_impl(const IGPUImage* const srcImage, const } -bool CVulkanCommandBuffer::copyAccelerationStructure_impl(const IGPUAccelerationStructure::CopyInfo& copyInfo) +bool CVulkanCommandBuffer::copyAccelerationStructure_impl(const IGPUAccelerationStructure* src, IGPUAccelerationStructure* dst, const bool compact) { - const auto info = getVkCopyAccelerationStructureInfoFrom(copyInfo); + VkCopyAccelerationStructureInfoKHR info = { VK_STRUCTURE_TYPE_COPY_ACCELERATION_STRUCTURE_INFO_KHR,nullptr }; + info.src = *reinterpret_cast(src->getNativeHandle()); + info.dst = *reinterpret_cast(dst->getNativeHandle()); + info.mode = compact ? 
VK_COPY_ACCELERATION_STRUCTURE_MODE_COMPACT_KHR:VK_COPY_ACCELERATION_STRUCTURE_MODE_CLONE_KHR; getFunctionTable().vkCmdCopyAccelerationStructureKHR(m_cmdbuf,&info); return true; } -bool CVulkanCommandBuffer::copyAccelerationStructureToMemory_impl(const IGPUAccelerationStructure::DeviceCopyToMemoryInfo& copyInfo) +bool CVulkanCommandBuffer::copyAccelerationStructureToMemory_impl(const IGPUAccelerationStructure* src, const asset::SBufferBinding& dst) { - const auto info = getVkCopyAccelerationStructureToMemoryInfoFrom(copyInfo); + VkCopyAccelerationStructureToMemoryInfoKHR info = { VK_STRUCTURE_TYPE_COPY_ACCELERATION_STRUCTURE_TO_MEMORY_INFO_KHR,nullptr }; + info.src = *reinterpret_cast(src->getNativeHandle()); + info.dst = getVkDeviceOrHostAddress(dst); + info.mode = VK_COPY_ACCELERATION_STRUCTURE_MODE_SERIALIZE_KHR; getFunctionTable().vkCmdCopyAccelerationStructureToMemoryKHR(m_cmdbuf,&info); return true; } -bool CVulkanCommandBuffer::copyAccelerationStructureFromMemory_impl(const IGPUAccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo) +bool CVulkanCommandBuffer::copyAccelerationStructureFromMemory_impl(const asset::SBufferBinding& src, IGPUAccelerationStructure* dst) { - const auto info = getVkCopyMemoryToAccelerationStructureInfoFrom(copyInfo); + VkCopyMemoryToAccelerationStructureInfoKHR info = { VK_STRUCTURE_TYPE_COPY_MEMORY_TO_ACCELERATION_STRUCTURE_INFO_KHR,nullptr }; + info.src = getVkDeviceOrHostAddress(src); + info.dst = *reinterpret_cast(dst->getNativeHandle()); + info.mode = VK_COPY_ACCELERATION_STRUCTURE_MODE_DESERIALIZE_KHR; getFunctionTable().vkCmdCopyMemoryToAccelerationStructureKHR(m_cmdbuf,&info); return true; } diff --git a/src/nbl/video/CVulkanCommandBuffer.h b/src/nbl/video/CVulkanCommandBuffer.h index 634d8c4f2b..f31a79387d 100644 --- a/src/nbl/video/CVulkanCommandBuffer.h +++ b/src/nbl/video/CVulkanCommandBuffer.h @@ -177,9 +177,9 @@ class CVulkanCommandBuffer final : public IGPUCommandBuffer return true; } - bool 
copyAccelerationStructure_impl(const IGPUAccelerationStructure::CopyInfo& copyInfo) override; - bool copyAccelerationStructureToMemory_impl(const IGPUAccelerationStructure::DeviceCopyToMemoryInfo& copyInfo) override; - bool copyAccelerationStructureFromMemory_impl(const IGPUAccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo) override; + bool copyAccelerationStructure_impl(const IGPUAccelerationStructure* src, IGPUAccelerationStructure* dst, const bool compact); + bool copyAccelerationStructureToMemory_impl(const IGPUAccelerationStructure* src, const asset::SBufferBinding& dst); + bool copyAccelerationStructureFromMemory_impl(const asset::SBufferBinding& src, IGPUAccelerationStructure* dst); bool bindComputePipeline_impl(const IGPUComputePipeline* const pipeline) override; bool bindGraphicsPipeline_impl(const IGPUGraphicsPipeline* const pipeline) override; diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 2e30a18269..b27760699c 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -499,21 +499,30 @@ bool CVulkanLogicalDevice::writeAccelerationStructuresProperties_impl(const std: return m_devf.vk.vkWriteAccelerationStructuresPropertiesKHR(m_vkdev,vk_accelerationStructures.size(),vk_accelerationStructures.data(),static_cast(type),stride*accelerationStructures.size(),data,stride); } -auto CVulkanLogicalDevice::copyAccelerationStructure_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::CopyInfo& copyInfo) -> DEFERRABLE_RESULT +auto CVulkanLogicalDevice::copyAccelerationStructure_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure* src, IGPUAccelerationStructure* dst, const bool compact) -> DEFERRABLE_RESULT { - const auto info = getVkCopyAccelerationStructureInfoFrom(copyInfo); + VkCopyAccelerationStructureInfoKHR info = { VK_STRUCTURE_TYPE_COPY_ACCELERATION_STRUCTURE_INFO_KHR,nullptr }; + info.src = 
*reinterpret_cast(src->getNativeHandle()); + info.dst = *reinterpret_cast(dst->getNativeHandle()); + info.mode = compact ? VK_COPY_ACCELERATION_STRUCTURE_MODE_COMPACT_KHR:VK_COPY_ACCELERATION_STRUCTURE_MODE_CLONE_KHR; return getDeferrableResultFrom(m_devf.vk.vkCopyAccelerationStructureKHR(m_vkdev,static_cast(deferredOperation)->getInternalObject(),&info)); } -auto CVulkanLogicalDevice::copyAccelerationStructureToMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyToMemoryInfo& copyInfo) -> DEFERRABLE_RESULT +auto CVulkanLogicalDevice::copyAccelerationStructureToMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure* src, const asset::SBufferBinding& dst) -> DEFERRABLE_RESULT { - const auto info = getVkCopyAccelerationStructureToMemoryInfoFrom(copyInfo); + VkCopyAccelerationStructureToMemoryInfoKHR info = { VK_STRUCTURE_TYPE_COPY_ACCELERATION_STRUCTURE_TO_MEMORY_INFO_KHR,nullptr }; + info.src = *reinterpret_cast(src->getNativeHandle()); + info.dst = getVkDeviceOrHostAddress(dst); + info.mode = VK_COPY_ACCELERATION_STRUCTURE_MODE_SERIALIZE_KHR; return getDeferrableResultFrom(m_devf.vk.vkCopyAccelerationStructureToMemoryKHR(m_vkdev,static_cast(deferredOperation)->getInternalObject(),&info)); } -auto CVulkanLogicalDevice::copyAccelerationStructureFromMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyFromMemoryInfo& copyInfo) -> DEFERRABLE_RESULT +auto CVulkanLogicalDevice::copyAccelerationStructureFromMemory_impl(IDeferredOperation* const deferredOperation, const asset::SBufferBinding& src, IGPUAccelerationStructure* dst) -> DEFERRABLE_RESULT { - const auto info = getVkCopyMemoryToAccelerationStructureInfoFrom(copyInfo); + VkCopyMemoryToAccelerationStructureInfoKHR info = { VK_STRUCTURE_TYPE_COPY_MEMORY_TO_ACCELERATION_STRUCTURE_INFO_KHR,nullptr }; + info.src = getVkDeviceOrHostAddress(src); + info.dst = 
*reinterpret_cast(dst->getNativeHandle()); + info.mode = VK_COPY_ACCELERATION_STRUCTURE_MODE_DESERIALIZE_KHR; return getDeferrableResultFrom(m_devf.vk.vkCopyMemoryToAccelerationStructureKHR(m_vkdev,static_cast(deferredOperation)->getInternalObject(),&info)); } diff --git a/src/nbl/video/CVulkanLogicalDevice.h b/src/nbl/video/CVulkanLogicalDevice.h index 0c5666fae5..06f95a4fc5 100644 --- a/src/nbl/video/CVulkanLogicalDevice.h +++ b/src/nbl/video/CVulkanLogicalDevice.h @@ -261,9 +261,9 @@ class CVulkanLogicalDevice final : public ILogicalDevice return getDeferrableResultFrom(m_devf.vk.vkBuildAccelerationStructuresKHR(m_vkdev,static_cast(deferredOperation)->getInternalObject(),infoCount,vk_buildGeomsInfos.data(),vk_ppBuildRangeInfos)); } bool writeAccelerationStructuresProperties_impl(const std::span accelerationStructures, const IQueryPool::TYPE type, size_t* data, const size_t stride) override; - DEFERRABLE_RESULT copyAccelerationStructure_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::CopyInfo& copyInfo) override; - DEFERRABLE_RESULT copyAccelerationStructureToMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyToMemoryInfo& copyInfo) override; - DEFERRABLE_RESULT copyAccelerationStructureFromMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyFromMemoryInfo& copyInfo) override; + DEFERRABLE_RESULT copyAccelerationStructure_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure* src, IGPUAccelerationStructure* dst, const bool compact) override; + DEFERRABLE_RESULT copyAccelerationStructureToMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure* src, const asset::SBufferBinding& dst) override; + DEFERRABLE_RESULT copyAccelerationStructureFromMemory_impl(IDeferredOperation* const deferredOperation, const asset::SBufferBinding& src, IGPUAccelerationStructure* dst) override; // 
shaders core::smart_refctd_ptr createShader_impl(const asset::ICPUShader* spirvShader) override; diff --git a/src/nbl/video/IGPUCommandBuffer.cpp b/src/nbl/video/IGPUCommandBuffer.cpp index 6bde593097..5d3c889798 100644 --- a/src/nbl/video/IGPUCommandBuffer.cpp +++ b/src/nbl/video/IGPUCommandBuffer.cpp @@ -864,8 +864,8 @@ template uint32_t IGPUCommandBuffer::buildAccelerationStructures_common, IGPUTopLevelAccelerationStructure::MaxInputCounts* const, const IGPUBuffer* const ); - -bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUAccelerationStructure::CopyInfo& copyInfo) +template requires std::is_base_of_v +bool IGPUCommandBuffer::copyAccelerationStructure(const AccelerationStructure::CopyInfo& copyInfo) { if (!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT|queue_flags_t::TRANSFER_BIT,RENDERPASS_SCOPE::OUTSIDE)) return false; @@ -888,10 +888,18 @@ bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUAccelerationStructur } m_noCommands = false; - return copyAccelerationStructure_impl(copyInfo); + const bool retval = copyAccelerationStructure_impl(copyInfo.src,copyInfo.dst,copyInfo.compact); + if constexpr (std::is_same_v) + { +// if (copyInfo.buildVer) + } + return retval; } +template bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUBottomLevelAccelerationStructure::CopyInfo&); +template bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUTopLevelAccelerationStructure::CopyInfo&); -bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUAccelerationStructure::DeviceCopyToMemoryInfo& copyInfo) +template requires std::is_base_of_v +bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const AccelerationStructure::DeviceCopyToMemoryInfo& copyInfo) { if (!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT|queue_flags_t::TRANSFER_BIT,RENDERPASS_SCOPE::OUTSIDE)) return false; @@ -911,10 +919,17 @@ bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUAcceleration } m_noCommands = false; - return 
copyAccelerationStructureToMemory_impl(copyInfo); + const bool retval = copyAccelerationStructureToMemory_impl(copyInfo.src,copyInfo.dst); + if constexpr (std::is_same_v) + { + } + return retval; } +template bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUBottomLevelAccelerationStructure::DeviceCopyToMemoryInfo&); +template bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUTopLevelAccelerationStructure::DeviceCopyToMemoryInfo&); -bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUAccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo) +template requires std::is_base_of_v +bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const AccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo) { if (!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT|queue_flags_t::TRANSFER_BIT,RENDERPASS_SCOPE::OUTSIDE)) return false; @@ -934,8 +949,14 @@ bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUAccelerati } m_noCommands = false; - return copyAccelerationStructureFromMemory_impl(copyInfo); + const bool retval = copyAccelerationStructureFromMemory_impl(copyInfo.src,copyInfo.dst); + if constexpr (std::is_same_v) + { + } + return retval; } +template bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUBottomLevelAccelerationStructure::DeviceCopyFromMemoryInfo&); +template bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUTopLevelAccelerationStructure::DeviceCopyFromMemoryInfo&); bool IGPUCommandBuffer::bindComputePipeline(const IGPUComputePipeline* const pipeline) @@ -2078,22 +2099,18 @@ bool IGPUCommandBuffer::executeCommands(const uint32_t count, IGPUCommandBuffer* return executeCommands_impl(count,cmdbufs); } -bool IGPUCommandBuffer::recordReferences(const std::span refs) +core::smart_refctd_ptr* IGPUCommandBuffer::reserveReferences(const uint32_t size) { if 
(!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT|queue_flags_t::GRAPHICS_BIT|queue_flags_t::TRANSFER_BIT|queue_flags_t::SPARSE_BINDING_BIT)) - return false; + return nullptr; - auto cmd = m_cmdpool->m_commandListPool.emplace(m_commandList,refs.size()); + auto cmd = m_cmdpool->m_commandListPool.emplace(m_commandList,size); if (!cmd) { NBL_LOG_ERROR("out of host memory!"); - return false; + return nullptr; } - auto oit = cmd->getVariableCountResources(); - for (const auto& ref : refs) - *(oit++) = core::smart_refctd_ptr(ref); - - return true; + return cmd->getVariableCountResources(); } } \ No newline at end of file diff --git a/src/nbl/video/IQueue.cpp b/src/nbl/video/IQueue.cpp index e7612cc8d1..f5a4130825 100644 --- a/src/nbl/video/IQueue.cpp +++ b/src/nbl/video/IQueue.cpp @@ -156,12 +156,20 @@ IQueue::DeferredSubmitCallback::DeferredSubmitCallback(const SSubmitInfo& info) for (const auto& refSet : cb.cmdbuf->m_TLASToBLASReferenceSets) { const auto tlas = refSet.first; + using iterator = decltype(refSet.second)::iterator; + struct CustomIterator + { + inline bool operator!=(const CustomIterator& other) const {return ptr!=other.ptr;} + + inline CustomIterator operator++() {return {ptr++};} + + inline const IGPUBottomLevelAccelerationStructure* operator*() const {return dynamic_cast(ptr->get());} + + iterator ptr; + }; + const auto buildVer = tlas->pushTrackedBLASes({refSet.second.begin()},{refSet.second.end()}); // in theory could assert no duplicate entries, but thats obvious - auto& out = m_TLASToBLASReferenceSets[tlas]; - out.m_BLASes.reserve(refSet.second.size()); - for (const auto& refCtd : refSet.second) - out.m_BLASes.emplace(dynamic_cast(refCtd.get())); - out.m_buildVer = tlas->registerNextBuildVer(); + m_TLASBuilds[tlas] = buildVer; } } // We don't hold the last signal semaphore, because the timeline does as an Event trigger. 
@@ -174,10 +182,10 @@ IQueue::DeferredSubmitCallback::DeferredSubmitCallback(const SSubmitInfo& info) IQueue::DeferredSubmitCallback& IQueue::DeferredSubmitCallback::operator=(DeferredSubmitCallback&& other) { - m_TLASToBLASReferenceSets = std::move(other.m_TLASToBLASReferenceSets); + m_TLASBuilds = std::move(other.m_TLASBuilds); m_resources = std::move(other.m_resources); m_callback = std::move(other.m_callback); - other.m_TLASToBLASReferenceSets = {}; + other.m_TLASBuilds.clear(); other.m_resources = nullptr; other.m_callback = {}; return *this; @@ -186,13 +194,9 @@ IQueue::DeferredSubmitCallback& IQueue::DeferredSubmitCallback::operator=(Deferr // always exhaustive poll, because we need to get rid of resources ASAP void IQueue::DeferredSubmitCallback::operator()() { - // first update tracking info (needs resources alive) - for (const auto& refSet : m_TLASToBLASReferenceSets) - { - const auto tlas = refSet.first; - const auto& blases = refSet.second.m_BLASes; - tlas->setTrackedBLASes(blases.begin(),blases.end(),refSet.second.m_buildVer); - } + // all builds started before ours will now get overwritten (not exactly true, but without a better tracking system, this is the best we can do for now) + for (const auto& build : m_TLASBuilds) + build.first->clearTrackedBLASes(build.second); // then free all resources m_resources = nullptr; // then execute the callback diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index c69d373656..4d09a31eac 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -5261,7 +5261,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul compactedAS->setObjectDebugName(debugName.c_str()); } // record compaction - if (!computeCmdBuf->cmdbuf->copyAccelerationStructure({.src=srcAS,.dst=compactedAS.get(),.mode=IGPUAccelerationStructure::COPY_MODE::COMPACT})) + if 
(!computeCmdBuf->cmdbuf->copyAccelerationStructure({.src=srcAS,.dst=compactedAS.get(),.compact=true})) { logFail("record Acceleration Structure compaction",compactedAS.get()); continue; From 0f42726948f4c389a84d0bc68ab84f82b377c987 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 23 May 2025 12:04:27 +0200 Subject: [PATCH 181/346] implemented BLAS tracking for TLAS device-side copies --- include/nbl/video/IGPUAccelerationStructure.h | 13 ++-- include/nbl/video/IGPUCommandBuffer.h | 30 ++++++-- src/nbl/video/IGPUCommandBuffer.cpp | 22 +++--- src/nbl/video/IQueue.cpp | 74 +++++++++++++++---- 4 files changed, 105 insertions(+), 34 deletions(-) diff --git a/include/nbl/video/IGPUAccelerationStructure.h b/include/nbl/video/IGPUAccelerationStructure.h index 68b4c1940b..1b851093e2 100644 --- a/include/nbl/video/IGPUAccelerationStructure.h +++ b/include/nbl/video/IGPUAccelerationStructure.h @@ -379,6 +379,9 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr // inline uint32_t getMaxInstanceCount() const {return m_maxInstanceCount;} + // + using blas_smart_ptr_t = core::smart_refctd_ptr; + // copies struct CopyInfo { @@ -392,7 +395,7 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr const IGPUTopLevelAccelerationStructure* src = nullptr; asset::SBufferBinding dst = nullptr; // [optional] Query the tracked BLASes - core::smart_refctd_dynamic_array> trackedBLASes = nullptr; + core::smart_refctd_dynamic_array trackedBLASes = nullptr; }; using DeviceCopyToMemoryInfo = CopyToMemoryInfo; using HostCopyToMemoryInfo = CopyToMemoryInfo; @@ -693,8 +696,6 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr { return ++m_pendingBuildVer; } - // - using blas_smart_ptr_t = core::smart_refctd_ptr; // returns number of tracked BLASes if `tracked==nullptr` otherwise writes `*count` tracked BLASes from `first` into `*tracked` inline void getPendingBuildTrackedBLASes(uint32_t* count, 
blas_smart_ptr_t* tracked, const build_ver_t buildVer) const { @@ -703,10 +704,12 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr // stop multiple threads messing with us std::lock_guard lk(m_trackingLock); auto pBLASes = getPendingBuildTrackedBLASes(buildVer); + const auto origCount = *count; *count = pBLASes ? pBLASes->size():0; if (!tracked || !pBLASes) return; - for (auto it=pBLASes->begin(); it!=pBLASes->end(); it++) + auto it = pBLASes->begin(); + for (auto i = 0; i* getPendingBuildTrackedBLASes(const build_ver_t buildVer) const { const auto found = std::find_if(m_pendingBuilds.begin(),m_pendingBuilds.end(),[buildVer](const auto& item)->bool{return item.ordinal==buildVer;}); diff --git a/include/nbl/video/IGPUCommandBuffer.h b/include/nbl/video/IGPUCommandBuffer.h index 98d98ab98a..e1e672e838 100644 --- a/include/nbl/video/IGPUCommandBuffer.h +++ b/include/nbl/video/IGPUCommandBuffer.h @@ -571,7 +571,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject auto oit = reserveReferences(size); if (oit) { - m_TLASToBLASReferenceSets[tlas] = {oit,size}; + m_TLASTrackingOps.emplace_back(TLASTrackingWrite{.src={oit,size},.dst=tlas}); while (beginBLASes!=endBLASes) *(oit++) = core::smart_refctd_ptr(*(beginBLASes++)); } @@ -750,7 +750,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject m_state = STATE::INITIAL; m_boundDescriptorSetsRecord.clear(); - m_TLASToBLASReferenceSets.clear(); + m_TLASTrackingOps.clear(); m_boundGraphicsPipeline= nullptr; m_boundComputePipeline= nullptr; m_boundRayTracingPipeline= nullptr; @@ -768,7 +768,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject { deleteCommandList(); m_boundDescriptorSetsRecord.clear(); - m_TLASToBLASReferenceSets.clear(); + m_TLASTrackingOps.clear(); m_boundGraphicsPipeline= nullptr; m_boundComputePipeline= nullptr; m_boundRayTracingPipeline= nullptr; @@ -909,10 +909,26 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject // or 
IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_UNUSED_WHILE_PENDING_BIT. core::unordered_map m_boundDescriptorSetsRecord; - // If the user wants the builds to be tracking, and make the TLAS remember the BLASes that have been built into it. - // NOTE: We know that a TLAS may be rebuilt multiple times per frame on purpose and not only the final BLASes need to be kept alive till submission finishes. - // However, the Command Pool already tracks resources referenced in the Build Infos, so we only need pointers into those records. - core::unordered_map>> m_TLASToBLASReferenceSets; + // If the user wants the builds and copies to be tracking, and make the TLAS remember the BLASes that have been built into it. + // The Command Pool already tracks resources referenced in the Build Infos or Copies From Memory (Deserializations), so we only need pointers into those records. + struct TLASTrackingWrite + { + std::span> src; + IGPUTopLevelAccelerationStructure* dst; + }; + struct TLASTrackingCopy + { + const IGPUTopLevelAccelerationStructure* src; + IGPUTopLevelAccelerationStructure* dst; + }; + struct TLASTrackingRead + { + const IGPUTopLevelAccelerationStructure* src; + // For a copy to memory (Serialization), we need to dump the BLASes references + core::smart_refctd_dynamic_array dst; + }; + // operations as they'll be performed in order + core::vector> m_TLASTrackingOps; const IGPUGraphicsPipeline* m_boundGraphicsPipeline; const IGPUComputePipeline* m_boundComputePipeline; diff --git a/src/nbl/video/IGPUCommandBuffer.cpp b/src/nbl/video/IGPUCommandBuffer.cpp index 5d3c889798..40c5ea1e3b 100644 --- a/src/nbl/video/IGPUCommandBuffer.cpp +++ b/src/nbl/video/IGPUCommandBuffer.cpp @@ -842,10 +842,7 @@ uint32_t IGPUCommandBuffer::buildAccelerationStructures_common(const std::span) { const auto blasCount = info.trackedBLASes.size(); - if (blasCount) - m_TLASToBLASReferenceSets[info.dstAS] = {oit-blasCount,blasCount}; - else - m_TLASToBLASReferenceSets[info.dstAS] = 
{}; + m_TLASTrackingOps.emplace_back(TLASTrackingWrite{.src={oit-blasCount,blasCount},.dst=info.dstAS}); } } @@ -890,9 +887,7 @@ bool IGPUCommandBuffer::copyAccelerationStructure(const AccelerationStructure::C m_noCommands = false; const bool retval = copyAccelerationStructure_impl(copyInfo.src,copyInfo.dst,copyInfo.compact); if constexpr (std::is_same_v) - { -// if (copyInfo.buildVer) - } + m_TLASTrackingOps.emplace_back(TLASTrackingCopy{.src=copyInfo.src,.dst=copyInfo.dst}); return retval; } template bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUBottomLevelAccelerationStructure::CopyInfo&); @@ -921,8 +916,7 @@ bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const AccelerationStru m_noCommands = false; const bool retval = copyAccelerationStructureToMemory_impl(copyInfo.src,copyInfo.dst); if constexpr (std::is_same_v) - { - } + m_TLASTrackingOps.emplace_back(TLASTrackingRead{.src=copyInfo.src,.dst=copyInfo.trackedBLASes}); return retval; } template bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUBottomLevelAccelerationStructure::DeviceCopyToMemoryInfo&); @@ -952,6 +946,16 @@ bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const AccelerationSt const bool retval = copyAccelerationStructureFromMemory_impl(copyInfo.src,copyInfo.dst); if constexpr (std::is_same_v) { + const auto size = copyInfo.trackedBLASes.size(); + auto oit = reserveReferences(size); + if (oit) + { + m_TLASTrackingOps.emplace_back(TLASTrackingWrite{.src={oit,size},.dst=copyInfo.dst}); + for (const auto& blas : copyInfo.trackedBLASes) + *(oit++) = core::smart_refctd_ptr(blas); + } + else + NBL_LOG_ERROR("out of host memory for BLAS tracking references, TLAS will be copied from memory without BLAS tracking data!"); } return retval; } diff --git a/src/nbl/video/IQueue.cpp b/src/nbl/video/IQueue.cpp index f5a4130825..256233dc91 100644 --- a/src/nbl/video/IQueue.cpp +++ b/src/nbl/video/IQueue.cpp @@ -149,27 +149,75 @@ 
IQueue::DeferredSubmitCallback::DeferredSubmitCallback(const SSubmitInfo& info) auto outRes = m_resources->data(); for (const auto& sema : info.waitSemaphores) *(outRes++) = smart_ptr(sema.semaphore); + // track our own versions + core::unordered_map m_readTLASVersions; + // get the TLAS BLAS tracking info and assign a pending build version number + for (const auto& cb : info.commandBuffers) + for (const auto& var : cb.cmdbuf->m_TLASTrackingOps) + { + const IGPUTopLevelAccelerationStructure* src = nullptr; + switch (var.index()) + { + case 1: + src = std::get<1>(var).src; + break; + case 2: + src = std::get<2>(var).src; + break; + } + if (src) + m_readTLASVersions.insert({src,src->getPendingBuildVer()}); + } for (const auto& cb : info.commandBuffers) { *(outRes++) = smart_ptr(cb.cmdbuf); - // get the TLAS BLAS tracking info and assign a pending build version number - for (const auto& refSet : cb.cmdbuf->m_TLASToBLASReferenceSets) + for (const auto& var : cb.cmdbuf->m_TLASTrackingOps) + switch (var.index()) { - const auto tlas = refSet.first; - using iterator = decltype(refSet.second)::iterator; - struct CustomIterator + case 0: { - inline bool operator!=(const CustomIterator& other) const {return ptr!=other.ptr;} + const IGPUCommandBuffer::TLASTrackingWrite& op = std::get<0>(var); + using iterator = decltype(op.src)::iterator; + struct CustomIterator + { + inline bool operator!=(const CustomIterator& other) const { return ptr != other.ptr; } - inline CustomIterator operator++() {return {ptr++};} + inline CustomIterator operator++() { return { ptr++ }; } - inline const IGPUBottomLevelAccelerationStructure* operator*() const {return dynamic_cast(ptr->get());} + inline const IGPUBottomLevelAccelerationStructure* operator*() const { return dynamic_cast(ptr->get()); } - iterator ptr; - }; - const auto buildVer = tlas->pushTrackedBLASes({refSet.second.begin()},{refSet.second.end()}); - // in theory could assert no duplicate entries, but thats obvious - m_TLASBuilds[tlas] 
= buildVer; + iterator ptr; + }; + m_readTLASVersions[op.dst] = m_TLASBuilds[op.dst] = op.dst->pushTrackedBLASes({op.src.begin()},{op.src.end()}); + break; + } + case 1: + { + const IGPUCommandBuffer::TLASTrackingCopy& op = std::get<1>(var); + // not sure if even legal, but it would deadlock us + if (op.src==op.dst) + break; + const auto ver = m_readTLASVersions.find(op.src)->second; + // stop multiple threads messing with us + std::lock_guard lk(op.src->m_trackingLock); + const auto* pSrcBLASes = op.src->getPendingBuildTrackedBLASes(ver); + assert(pSrcBLASes); + m_readTLASVersions[op.dst] = m_TLASBuilds[op.dst] = op.dst->pushTrackedBLASes(pSrcBLASes->begin(),pSrcBLASes->end()); + break; + } + case 2: + { + const IGPUCommandBuffer::TLASTrackingRead& op = std::get<2>(var); + const auto ver = m_readTLASVersions.find(op.src)->second; + uint32_t count = op.dst->size(); + op.src->getPendingBuildTrackedBLASes(&count,op.dst->data(),ver); + if (count>op.dst->size()) + cb.cmdbuf->getOriginDevice()->getLogger()->log("BLAS output array too small, should be %d, only wrote out %d BLAS references to destination",system::ILogger::ELL_ERROR,count,op.dst->size()); + break; + } + default: + assert(false); + break; } } // We don't hold the last signal semaphore, because the timeline does as an Event trigger. 
From 302710fd4a5255b84f6e495b6e0dd398a8b45296 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 23 May 2025 12:53:17 +0200 Subject: [PATCH 182/346] clean up a bit and implement BLAS tracking info for Host Copies --- include/nbl/video/IGPUAccelerationStructure.h | 11 ++ include/nbl/video/ILogicalDevice.h | 101 +++++++++++++++--- include/nbl/video/IQueue.h | 2 +- src/nbl/video/IQueue.cpp | 23 ++-- 4 files changed, 104 insertions(+), 33 deletions(-) diff --git a/include/nbl/video/IGPUAccelerationStructure.h b/include/nbl/video/IGPUAccelerationStructure.h index 1b851093e2..1bb4fb0c66 100644 --- a/include/nbl/video/IGPUAccelerationStructure.h +++ b/include/nbl/video/IGPUAccelerationStructure.h @@ -750,6 +750,17 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr const uint32_t m_maxInstanceCount; private: + struct DynamicUpCastingSpanIterator + { + inline bool operator!=(const DynamicUpCastingSpanIterator& other) const {return ptr!=other.ptr;} + + inline DynamicUpCastingSpanIterator operator++() {return {ptr++};} + + inline const IGPUBottomLevelAccelerationStructure* operator*() const {return dynamic_cast(ptr->get());} + + std::span>::iterator ptr; + }; + friend class ILogicalDevice; friend class IQueue; inline const core::unordered_set* getPendingBuildTrackedBLASes(const build_ver_t buildVer) const { diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 0e36c9ace1..34036e2ffc 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -593,18 +593,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe auto tlas = set.first; // we know the build is completed immediately after performing it, so we get our pending stamp then // ideally we should get our build version when the work of the deferred op gets executed for the first time - using iterator = decltype(set.second)::iterator; - struct CustomIterator - { - inline bool operator!=(const 
CustomIterator& other) const {return ptr!=other.ptr;} - - inline CustomIterator operator++() {return {ptr++};} - - inline const IGPUBottomLevelAccelerationStructure* operator*() const {return dynamic_cast(ptr->get());} - - iterator ptr; - }; - const auto buildVer = tlas->pushTrackedBLASes({set.second.begin()},{set.second.end()}); + const auto buildVer = tlas->pushTrackedBLASes({set.second.begin()},{set.second.end()}); tlas->clearTrackedBLASes(buildVer); } } @@ -622,10 +611,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe if constexpr (IsTLAS) { const auto blasCount = info.trackedBLASes.size(); - if (blasCount) - callback.m_TLASToBLASReferenceSets[info.dstAS] = {oit-blasCount,blasCount}; - else - callback.m_TLASToBLASReferenceSets[info.dstAS] = {}; + callback.m_TLASToBLASReferenceSets[info.dstAS] = {oit-blasCount,blasCount}; } } if constexpr (IsTLAS) @@ -685,10 +671,42 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe } auto result = copyAccelerationStructure_impl(deferredOperation,copyInfo); if (result==DEFERRABLE_RESULT::DEFERRED) + { deferredOperation->m_resourceTracking.insert(deferredOperation->m_resourceTracking.begin(),{ core::smart_refctd_ptr(copyInfo.src), core::smart_refctd_ptr(copyInfo.dst) }); + constexpr bool IsTLAS = std::is_same_v; + if constexpr (IsTLAS) + { + struct TLASCallback + { + // upon completion set the BLASes tracked + inline void operator()(IDeferredOperation*) const + { + // not sure if even legal, but it would deadlock us + if (src==dst) + return; + uint32_t buildVer; + { + // stop multiple threads messing with us + std::lock_guard lk(src->m_trackingLock); + // we know the build is completed immediately after performing it, so we get our pending stamp then + // ideally we should get the BLAS set from the Source TLAS when the work of the deferred op gets executed for the first time + const auto* pSrcBLASes = src->getPendingBuildTrackedBLASes(src->getPendingBuildVer()); 
+ const std::span emptySpan = {}; + buildVer = pSrcBLASes ? dst->pushTrackedBLASes(pSrcBLASes->begin(),pSrcBLASes->end()):dst->pushTrackedBLASes(emptySpan.begin(),emptySpan.end()); + } + dst->clearTrackedBLASes(buildVer); + } + + // the rawpointers are already smartpointers in whatever else the `fillTracking` declared above writes + const IGPUTopLevelAccelerationStructure* src; + IGPUTopLevelAccelerationStructure* dst; + } callback = {.src=copyInfo.src,.dst=copyInfo.dst}; + deferredOperation->m_callback = std::move(callback); + } + } return result!=DEFERRABLE_RESULT::SOME_ERROR; @@ -713,10 +731,39 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe } auto result = copyAccelerationStructureToMemory_impl(deferredOperation,copyInfo); if (result==DEFERRABLE_RESULT::DEFERRED) + { deferredOperation->m_resourceTracking.insert(deferredOperation->m_resourceTracking.begin(),{ core::smart_refctd_ptr(copyInfo.src), core::smart_refctd_ptr(copyInfo.dst.buffer) }); + constexpr bool IsTLAS = std::is_same_v; + if constexpr (IsTLAS) + { + struct TLASCallback + { + // upon completion set the BLASes tracked + inline void operator()(IDeferredOperation*) const + { + // stop multiple threads messing with us + std::lock_guard lk(src->m_trackingLock); + // we know the build is completed immediately after performing it, so we get our pending stamp then + // ideally we should get the BLAS set from the Source TLAS when the work of the deferred op gets executed for the first time + const auto ver = src->getPendingBuildVer(); + uint32_t count = dst->size(); + src->getPendingBuildTrackedBLASes(&count,dst->data(),ver); + if (count>dst->size()) + logger->log("BLAS output array too small, should be %d, only wrote out %d BLAS references to destination",system::ILogger::ELL_ERROR,count,dst->size()); + } + + // device keeps it alive for entire lifetime of the callback + system::ILogger* logger; + // the rawpointers are already smartpointers in whatever else the 
`fillTracking` declared above writes + const IGPUTopLevelAccelerationStructure* src; + core::smart_refctd_dynamic_array dst; + } callback = {.logger=m_logger.get(),.src=copyInfo.src,.dst=copyInfo.trackedBLASes}; + deferredOperation->m_callback = std::move(callback); + } + } return result!=DEFERRABLE_RESULT::SOME_ERROR; } template requires std::is_base_of_v @@ -739,10 +786,32 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe } auto result = copyAccelerationStructureFromMemory_impl(deferredOperation,copyInfo); if (result==DEFERRABLE_RESULT::DEFERRED) + { deferredOperation->m_resourceTracking.insert(deferredOperation->m_resourceTracking.begin(),{ core::smart_refctd_ptr(copyInfo.src.buffer), core::smart_refctd_ptr(copyInfo.dst) }); + constexpr bool IsTLAS = std::is_same_v; + if constexpr (IsTLAS) + { + const size_t offset = deferredOperation->m_resourceTracking.size(); + deferredOperation->m_resourceTracking.insert(deferredOperation->m_resourceTracking.end(),copyInfo.trackedBLASes.begin(),copyInfo.trackedBLASes.end()); + struct TLASCallback + { + // upon completion set the BLASes tracked + inline void operator()(IDeferredOperation*) const + { + const auto buildVer = dst->pushTrackedBLASes({src->begin()},{src->end()}); + dst->clearTrackedBLASes(buildVer); + } + + // the rawpointers are already smartpointers in whatever else the `fillTracking` declared above writes + std::span> src; + IGPUTopLevelAccelerationStructure* dst; + } callback = {.src={deferredOperation->m_resourceTracking.data()+offset,copyInfo.trackedBLASes.size()},.dst=copyInfo.dst}; + deferredOperation->m_callback = std::move(callback); + } + } return result!=DEFERRABLE_RESULT::SOME_ERROR; } diff --git a/include/nbl/video/IQueue.h b/include/nbl/video/IQueue.h index c52e30517f..63073beb33 100644 --- a/include/nbl/video/IQueue.h +++ b/include/nbl/video/IQueue.h @@ -125,7 +125,7 @@ class IQueue : public core::Interface, public core::Unmovable class DeferredSubmitCallback final 
{ // - core::unordered_map m_TLASBuilds; + core::unordered_map m_TLASOverwrites; // using smart_ptr = core::smart_refctd_ptr; core::smart_refctd_dynamic_array m_resources; diff --git a/src/nbl/video/IQueue.cpp b/src/nbl/video/IQueue.cpp index 256233dc91..108f76183c 100644 --- a/src/nbl/video/IQueue.cpp +++ b/src/nbl/video/IQueue.cpp @@ -177,18 +177,9 @@ IQueue::DeferredSubmitCallback::DeferredSubmitCallback(const SSubmitInfo& info) case 0: { const IGPUCommandBuffer::TLASTrackingWrite& op = std::get<0>(var); - using iterator = decltype(op.src)::iterator; - struct CustomIterator - { - inline bool operator!=(const CustomIterator& other) const { return ptr != other.ptr; } - - inline CustomIterator operator++() { return { ptr++ }; } - inline const IGPUBottomLevelAccelerationStructure* operator*() const { return dynamic_cast(ptr->get()); } - - iterator ptr; - }; - m_readTLASVersions[op.dst] = m_TLASBuilds[op.dst] = op.dst->pushTrackedBLASes({op.src.begin()},{op.src.end()}); + using iterator = decltype(op.src)::iterator; + m_readTLASVersions[op.dst] = m_TLASOverwrites[op.dst] = op.dst->pushTrackedBLASes({op.src.begin()},{op.src.end()}); break; } case 1: @@ -201,8 +192,8 @@ IQueue::DeferredSubmitCallback::DeferredSubmitCallback(const SSubmitInfo& info) // stop multiple threads messing with us std::lock_guard lk(op.src->m_trackingLock); const auto* pSrcBLASes = op.src->getPendingBuildTrackedBLASes(ver); - assert(pSrcBLASes); - m_readTLASVersions[op.dst] = m_TLASBuilds[op.dst] = op.dst->pushTrackedBLASes(pSrcBLASes->begin(),pSrcBLASes->end()); + const std::span emptySpan = {}; + m_readTLASVersions[op.dst] = m_TLASOverwrites[op.dst] = pSrcBLASes ? 
op.dst->pushTrackedBLASes(pSrcBLASes->begin(),pSrcBLASes->end()):op.dst->pushTrackedBLASes(emptySpan.begin(),emptySpan.end()); break; } case 2: @@ -230,10 +221,10 @@ IQueue::DeferredSubmitCallback::DeferredSubmitCallback(const SSubmitInfo& info) IQueue::DeferredSubmitCallback& IQueue::DeferredSubmitCallback::operator=(DeferredSubmitCallback&& other) { - m_TLASBuilds = std::move(other.m_TLASBuilds); + m_TLASOverwrites = std::move(other.m_TLASOverwrites); m_resources = std::move(other.m_resources); m_callback = std::move(other.m_callback); - other.m_TLASBuilds.clear(); + other.m_TLASOverwrites.clear(); other.m_resources = nullptr; other.m_callback = {}; return *this; @@ -243,7 +234,7 @@ IQueue::DeferredSubmitCallback& IQueue::DeferredSubmitCallback::operator=(Deferr void IQueue::DeferredSubmitCallback::operator()() { // all builds started before ours will now get overwritten (not exactly true, but without a better tracking system, this is the best we can do for now) - for (const auto& build : m_TLASBuilds) + for (const auto& build : m_TLASOverwrites) build.first->clearTrackedBLASes(build.second); // then free all resources m_resources = nullptr; From 5813d1067919568c86ca8203fa8760237e82f381 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 23 May 2025 13:59:50 +0200 Subject: [PATCH 183/346] finish const correctness of Descriptor Set Layouts in Pipelines --- include/nbl/video/ILogicalDevice.h | 8 ++++---- src/nbl/video/CVulkanLogicalDevice.cpp | 10 +++++----- src/nbl/video/CVulkanLogicalDevice.h | 4 ++-- src/nbl/video/CVulkanPipelineLayout.h | 4 ++-- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index c2f2605d0b..c84461ef9f 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -837,8 +837,8 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe // Create a pipeline layout (@see ICPUPipelineLayout) 
core::smart_refctd_ptr createPipelineLayout( const std::span pcRanges={}, - core::smart_refctd_ptr&& _layout0=nullptr, core::smart_refctd_ptr&& _layout1=nullptr, - core::smart_refctd_ptr&& _layout2=nullptr, core::smart_refctd_ptr&& _layout3=nullptr + core::smart_refctd_ptr&& _layout0=nullptr, core::smart_refctd_ptr&& _layout1=nullptr, + core::smart_refctd_ptr&& _layout2=nullptr, core::smart_refctd_ptr&& _layout3=nullptr ) { if ((_layout0 && !_layout0->wasCreatedBy(this))) @@ -1217,8 +1217,8 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe virtual core::smart_refctd_ptr createDescriptorSetLayout_impl(const std::span bindings, const uint32_t maxSamplersCount) = 0; virtual core::smart_refctd_ptr createPipelineLayout_impl( const std::span pcRanges, - core::smart_refctd_ptr&& _layout0, core::smart_refctd_ptr&& _layout1, - core::smart_refctd_ptr&& _layout2, core::smart_refctd_ptr&& _layout3 + core::smart_refctd_ptr&& _layout0, core::smart_refctd_ptr&& _layout1, + core::smart_refctd_ptr&& _layout2, core::smart_refctd_ptr&& _layout3 ) = 0; virtual core::smart_refctd_ptr createDescriptorPool_impl(const IDescriptorPool::SCreateInfo& createInfo) = 0; diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index b27760699c..bb2d6d6cb4 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -597,13 +597,13 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createDesc core::smart_refctd_ptr CVulkanLogicalDevice::createPipelineLayout_impl( const std::span pcRanges, - core::smart_refctd_ptr&& layout0, - core::smart_refctd_ptr&& layout1, - core::smart_refctd_ptr&& layout2, - core::smart_refctd_ptr&& layout3 + core::smart_refctd_ptr&& layout0, + core::smart_refctd_ptr&& layout1, + core::smart_refctd_ptr&& layout2, + core::smart_refctd_ptr&& layout3 ) { - const core::smart_refctd_ptr tmp[] = { layout0, layout1, layout2, layout3 }; + const core::smart_refctd_ptr tmp[] = { 
layout0, layout1, layout2, layout3 }; VkDescriptorSetLayout vk_dsLayouts[asset::ICPUPipelineLayout::DESCRIPTOR_SET_COUNT]; uint32_t nonNullSetLayoutCount = ~0u; diff --git a/src/nbl/video/CVulkanLogicalDevice.h b/src/nbl/video/CVulkanLogicalDevice.h index 06f95a4fc5..6386bdfa7c 100644 --- a/src/nbl/video/CVulkanLogicalDevice.h +++ b/src/nbl/video/CVulkanLogicalDevice.h @@ -272,8 +272,8 @@ class CVulkanLogicalDevice final : public ILogicalDevice core::smart_refctd_ptr createDescriptorSetLayout_impl(const std::span bindings, const uint32_t maxSamplersCount) override; core::smart_refctd_ptr createPipelineLayout_impl( const std::span pcRanges, - core::smart_refctd_ptr&& _layout0, core::smart_refctd_ptr&& _layout1, - core::smart_refctd_ptr&& _layout2, core::smart_refctd_ptr&& _layout3 + core::smart_refctd_ptr&& _layout0, core::smart_refctd_ptr&& _layout1, + core::smart_refctd_ptr&& _layout2, core::smart_refctd_ptr&& _layout3 ) override; // descriptor sets diff --git a/src/nbl/video/CVulkanPipelineLayout.h b/src/nbl/video/CVulkanPipelineLayout.h index d89d2a493c..ef46226fdb 100644 --- a/src/nbl/video/CVulkanPipelineLayout.h +++ b/src/nbl/video/CVulkanPipelineLayout.h @@ -15,8 +15,8 @@ class CVulkanPipelineLayout : public IGPUPipelineLayout public: CVulkanPipelineLayout( const ILogicalDevice* dev, const std::span _pcRanges, - core::smart_refctd_ptr&& _layout0, core::smart_refctd_ptr&& _layout1, - core::smart_refctd_ptr&& _layout2, core::smart_refctd_ptr&& _layout3, + core::smart_refctd_ptr&& _layout0, core::smart_refctd_ptr&& _layout1, + core::smart_refctd_ptr&& _layout2, core::smart_refctd_ptr&& _layout3, const VkPipelineLayout vk_layout ) : IGPUPipelineLayout( core::smart_refctd_ptr(dev), From e4487ba3d92735c0a7bb587e23e87ef03607c2ea Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 23 May 2025 14:17:53 +0200 Subject: [PATCH 184/346] small lifetime issue fix --- src/nbl/video/utilities/CAssetConverter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 4d09a31eac..548c049bfe 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -1955,7 +1955,6 @@ class GetDependantVisit : public GetDependantVisitBase(extraArgs...); if constexpr (std::is_same_v) @@ -1985,6 +1984,7 @@ class GetDependantVisit : public GetDependantVisitBase Date: Fri, 23 May 2025 14:31:42 +0200 Subject: [PATCH 185/346] fix device_jit_traits generation --- src/nbl/device/gen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nbl/device/gen.py b/src/nbl/device/gen.py index 253d529b3d..88174cb3c2 100644 --- a/src/nbl/device/gen.py +++ b/src/nbl/device/gen.py @@ -120,7 +120,7 @@ args.jit_traits_output_path, buildTraitsHeader, type="JIT Members", - template="oss << \"NBL_CONSTEXPR_STATIC_INLINE {} {} = ({})\" + CJITIncludeLoader::to_string({}.{});", + template="oss << \"NBL_CONSTEXPR_STATIC_INLINE {} {} = ({})\" + CJITIncludeLoader::to_string({}.{}) << \";\\n\";", limits_json=limits, features_json=features, format_params=["type", "name", "type", "json_type", "cpp_name"], From ad96f8abf35e2face03e81148e90cafae91d25df Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 23 May 2025 14:33:44 +0200 Subject: [PATCH 186/346] pre merge submodule update --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 06bf814d56..69ba991ea4 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 06bf814d56648d1468256f5231f2b772a5bd3263 +Subproject commit 69ba991ea4827c80d008a31256785f4c4c60f12d From d042f42597fb6e12f9be04bb045145934de09d08 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 23 May 2025 20:26:30 +0700 Subject: [PATCH 187/346] Add some utility function to IGPURayTracingPipeline SShaderGroup --- include/nbl/video/IGPURayTracingPipeline.h | 28 ++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff 
--git a/include/nbl/video/IGPURayTracingPipeline.h b/include/nbl/video/IGPURayTracingPipeline.h index f7a92252f7..66e3a01072 100644 --- a/include/nbl/video/IGPURayTracingPipeline.h +++ b/include/nbl/video/IGPURayTracingPipeline.h @@ -38,6 +38,34 @@ class IGPURayTracingPipeline : public IGPUPipeline Date: Fri, 23 May 2025 20:26:47 +0700 Subject: [PATCH 188/346] Fix debloat logic in logical device --- include/nbl/video/ILogicalDevice.h | 10 +- src/nbl/video/ILogicalDevice.cpp | 177 +++++++++++++++++++++++++---- 2 files changed, 158 insertions(+), 29 deletions(-) diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 49364f3a54..ab0d5bea06 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -1097,7 +1097,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe virtual core::smart_refctd_ptr createFramebuffer_impl(IGPUFramebuffer::SCreationParams&& params) = 0; template - inline CreationParams::SSpecializationValidationResult commonCreatePipelines(IGPUPipelineCache* const pipelineCache, const std::span params, ExtraLambda&& extra) + inline SSpecializationValidationResult commonCreatePipelines(IGPUPipelineCache* const pipelineCache, const std::span params, ExtraLambda&& extra) { if (pipelineCache && !pipelineCache->wasCreatedBy(this)) { @@ -1110,7 +1110,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe return {}; } - typename CreationParams::SSpecializationValidationResult retval = {.count=0,.dataSize=0}; + SSpecializationValidationResult retval = {.count=0,.dataSize=0}; for (auto i=0; i createInfos, core::smart_refctd_ptr* const output, - const IGPUComputePipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) = 0; virtual void createGraphicsPipelines_impl( IGPUPipelineCache* const pipelineCache, const std::span params, core::smart_refctd_ptr* const output, - 
const IGPUGraphicsPipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) = 0; virtual void createRayTracingPipelines_impl( IGPUPipelineCache* const pipelineCache, const std::span createInfos, core::smart_refctd_ptr* const output, - const IGPURayTracingPipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) = 0; virtual core::smart_refctd_ptr createQueryPool_impl(const IQueryPool::SCreationParams& params) = 0; diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 26cfc4c6a8..d43ef7c58c 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -7,11 +7,70 @@ using namespace nbl; using namespace nbl::video; -static void debloatShaders(const asset::ISPIRVDebloater& debloater, std::span shaderSpecs, core::vector>& outShaders, asset::IPipelineBase::SShaderSpecInfo* outShaderSpecInfos, system::logger_opt_ptr logger = nullptr) +class SpirvDebloatTask +{ + public: + using EntryPoints = core::set; + + SpirvDebloatTask(asset::ISPIRVDebloater* debloater, system::logger_opt_ptr logger) : m_debloater(debloater), m_logger(logger) + { + + } + + void insertEntryPoint(const IGPUPipelineBase::SShaderSpecInfo& shaderSpec, hlsl::ShaderStage stage) + { + const auto* shader = shaderSpec.shader; + auto it = m_entryPointsMap.find(shader); + if (it == m_entryPointsMap.end() || it->first != shader) + it = m_entryPointsMap.emplace_hint(it, shader, EntryPoints()); + it->second.insert({ .name = shaderSpec.entryPoint, .stage = stage }); + } + + IGPUPipelineBase::SShaderSpecInfo debloat(const IGPUPipelineBase::SShaderSpecInfo& shaderSpec, core::vector>& outShaders) + { + const auto* shader = shaderSpec.shader; + const auto& entryPoints = m_entryPointsMap[shader]; + + auto debloatedShaderSpec = shaderSpec; + if (shader != nullptr) + { + if (!m_debloatedShadersMap.contains(shader)) + { + const auto 
outShadersData = outShaders.data(); + outShaders.push_back(m_debloater->debloat(shader, entryPoints, m_logger)); + assert(outShadersData == outShaders.data()); + m_debloatedShadersMap.emplace(shader, outShaders.back().get()); + } + const auto debloatedShader = m_debloatedShadersMap[shader]; + debloatedShaderSpec.shader = debloatedShader; + } + return debloatedShaderSpec; + } + + private: + core::map m_entryPointsMap; + core::map m_debloatedShadersMap; + asset::ISPIRVDebloater* m_debloater; + const system::logger_opt_ptr m_logger; +}; + +using DebloaterEntryPoints = core::set; +static void insertEntryPoint(const IGPUPipelineBase::SShaderSpecInfo& shaderSpec, hlsl::ShaderStage stage, + core::map entryPointsMap) +{ + const auto* shader = shaderSpec.shader; + auto it = entryPointsMap.find(shader); + if (it == entryPointsMap.end() || it->first != shader) + it = entryPointsMap.emplace_hint(it, shader, DebloaterEntryPoints()); + it->second.insert({ .name = shaderSpec.entryPoint, .stage = stage }); +}; + +static void debloatShaders(const asset::ISPIRVDebloater& debloater, std::span shaderSpecs, core::vector>& outShaders, IGPUPipelineBase::SShaderSpecInfo* outShaderSpecInfos, system::logger_opt_ptr logger = nullptr) { using EntryPoints = core::set; core::map entryPointsMap; + // collect all entry points first before we debloat for (const auto& shaderSpec : shaderSpecs) { const auto* shader = shaderSpec.shader; @@ -781,10 +840,10 @@ asset::ICPUPipelineCache::SCacheKey ILogicalDevice::getPipelineCacheKey() const bool ILogicalDevice::createComputePipelines(IGPUPipelineCache* const pipelineCache, const std::span params, core::smart_refctd_ptr* const output) { std::fill_n(output,params.size(),nullptr); - IGPUComputePipeline::SCreationParams::SSpecializationValidationResult specConstantValidation = commonCreatePipelines(pipelineCache,params,[this](const asset::IPipelineBase::SShaderSpecInfo& info)->bool + SSpecializationValidationResult specConstantValidation = 
commonCreatePipelines(pipelineCache,params,[this](const IGPUPipelineBase::SShaderSpecInfo& info)->bool { // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02755 - if (info.requiredSubgroupSize>=asset::IPipelineBase::SShaderSpecInfo::SUBGROUP_SIZE::REQUIRE_4 && !getPhysicalDeviceLimits().requiredSubgroupSizeStages.hasFlags(info.stage)) + if (info.requiredSubgroupSize>=asset::IPipelineBase::SUBGROUP_SIZE::REQUIRE_4 && !getPhysicalDeviceLimits().requiredSubgroupSizeStages.hasFlags(hlsl::ShaderStage::ESS_COMPUTE)) { NBL_LOG_ERROR("Invalid shader stage"); return false; @@ -808,7 +867,11 @@ bool ILogicalDevice::createComputePipelines(IGPUPipelineCache* const pipelineCac for (auto ix = 0u; ix < params.size(); ix++) { const auto& ci = params[ix]; - debloatShaders(*m_spirvDebloater.get(), ci.getShaders(), debloatedShaders, &newParams[ix].shader, m_logger); + const core::set entryPoints = { asset::ISPIRVDebloater::EntryPoint{.name = ci.shader.entryPoint, .stage = hlsl::ShaderStage::ESS_COMPUTE} }; + debloatedShaders.push_back(m_spirvDebloater->debloat(ci.shader.shader, entryPoints, m_logger)); + auto debloatedShaderSpec = ci.shader; + debloatedShaderSpec.shader = debloatedShaders.back().get(); + newParams[ix].shader = debloatedShaderSpec; } createComputePipelines_impl(pipelineCache,newParams,output,specConstantValidation); @@ -834,12 +897,10 @@ bool ILogicalDevice::createGraphicsPipelines( ) { std::fill_n(output, params.size(), nullptr); - IGPUGraphicsPipeline::SCreationParams::SSpecializationValidationResult specConstantValidation = commonCreatePipelines(nullptr, params, - [this](const asset::IPipelineBase::SShaderSpecInfo& info)->bool + SSpecializationValidationResult specConstantValidation = commonCreatePipelines(nullptr, params, + [this](const IGPUPipelineBase::SShaderSpecInfo& info)->bool { - if (info.stage != hlsl::ShaderStage::ESS_VERTEX) - return true; - return 
info.shader; + return info.shader != nullptr; } ); if (!specConstantValidation) @@ -858,9 +919,6 @@ bool ILogicalDevice::createGraphicsPipelines( core::vector> debloatedShaders; // vector to hold all the debloated shaders, so the pointer from the new ShaderSpecInfo is not dangling debloatedShaders.reserve(shaderCount); - core::vector debloatedShaderSpecs(shaderCount); - auto outShaderSpecs = debloatedShaderSpecs.data(); - for (auto ix = 0u; ix < params.size(); ix++) { const auto& ci = params[ix]; @@ -953,9 +1011,19 @@ bool ILogicalDevice::createGraphicsPipelines( } } } + + SpirvDebloatTask debloatTask(m_spirvDebloater.get(), m_logger); + debloatTask.insertEntryPoint(ci.vertexShader, hlsl::ShaderStage::ESS_VERTEX); + debloatTask.insertEntryPoint(ci.tesselationControlShader, hlsl::ShaderStage::ESS_TESSELLATION_CONTROL); + debloatTask.insertEntryPoint(ci.tesselationEvaluationShader, hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION); + debloatTask.insertEntryPoint(ci.geometryShader, hlsl::ShaderStage::ESS_GEOMETRY); + debloatTask.insertEntryPoint(ci.fragmentShader, hlsl::ShaderStage::ESS_FRAGMENT); - newParams[ix].shaders = std::span(outShaderSpecs, ci.getShaders().size()); - debloatShaders(*m_spirvDebloater.get(), ci.getShaders(), debloatedShaders, outShaderSpecs, m_logger); + newParams[ix].vertexShader = debloatTask.debloat(ci.vertexShader, debloatedShaders); + newParams[ix].tesselationControlShader = debloatTask.debloat(ci.tesselationControlShader, debloatedShaders); + newParams[ix].tesselationEvaluationShader = debloatTask.debloat(ci.tesselationEvaluationShader, debloatedShaders); + newParams[ix].geometryShader = debloatTask.debloat(ci.geometryShader, debloatedShaders); + newParams[ix].fragmentShader = debloatTask.debloat(ci.fragmentShader, debloatedShaders); } createGraphicsPipelines_impl(pipelineCache, newParams, output, specConstantValidation); @@ -980,7 +1048,7 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline 
core::smart_refctd_ptr* const output) { std::fill_n(output,params.size(),nullptr); - IGPURayTracingPipeline::SCreationParams::SSpecializationValidationResult specConstantValidation = commonCreatePipelines(pipelineCache,params,[this](const asset::IPipelineBase::SShaderSpecInfo& info)->bool + SSpecializationValidationResult specConstantValidation = commonCreatePipelines(pipelineCache,params,[this](const IGPUPipelineBase::SShaderSpecInfo& info)->bool { return true; }); @@ -1028,15 +1096,43 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline } core::vector newParams(params.begin(), params.end()); - const auto shaderCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) + const auto raygenCount = params.size(); // assume every param have raygen + const auto missShaderCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) { - return sum + param.getShaders().size(); + return sum + param.shaderGroups.getMissShaderCount(); }); + const auto hitShaderCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) + { + return sum + param.shaderGroups.getHitShaderCount(); + }); + const auto callableShaderCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) + { + return sum + param.shaderGroups.getCallableShaderCount(); + }); + const auto shaderCount = raygenCount + missShaderCount + hitShaderCount + callableShaderCount; core::vector> debloatedShaders; // vector to hold all the debloated shaders, so the pointer from the new ShaderSpecInfo is not dangling debloatedShaders.reserve(shaderCount); - core::vector debloatedShaderSpecs(shaderCount); - auto outShaderSpecs = debloatedShaderSpecs.data(); + const auto missGroupCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) + { + return sum + param.shaderGroups.misses.size(); + }); + const auto hitGroupCount = std::accumulate(params.begin(), 
params.end(), 0, [](uint32_t sum, auto& param) + { + return sum + param.shaderGroups.hits.size(); + }); + const auto callableGroupCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) + { + return sum + param.shaderGroups.callables.size(); + }); + + + core::vector debloatedMissSpecs(missGroupCount); + auto debloatedMissSpecData = debloatedMissSpecs.data(); + core::vector debloatedHitSpecs(hitGroupCount); + auto debloatedHitSpecData = debloatedHitSpecs.data(); + core::vector debloatedCallableSpecs(callableGroupCount); + auto debloatedCallableSpecData = debloatedCallableSpecs.data(); const auto& limits = getPhysicalDeviceLimits(); for (auto ix = 0u; ix < params.size(); ix++) @@ -1050,14 +1146,47 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline NBL_LOG_ERROR("Invalid maxRecursionDepth. maxRecursionDepth(%u) exceed the limits(%u)", param.cached.maxRecursionDepth, limits.maxRayRecursionDepth); return false; } - if (param.getShaders().empty()) + + SpirvDebloatTask debloatTask(m_spirvDebloater.get(), m_logger); + debloatTask.insertEntryPoint(param.shaderGroups.raygen, hlsl::ShaderStage::ESS_RAYGEN); + for (const auto& miss : param.shaderGroups.misses) + debloatTask.insertEntryPoint(miss, hlsl::ShaderStage::ESS_MISS); + for (const auto& hit : param.shaderGroups.hits) { - NBL_LOG_ERROR("Pipeline must have at least one shader."); - return false; + debloatTask.insertEntryPoint(hit.closestHit, hlsl::ShaderStage::ESS_CLOSEST_HIT); + debloatTask.insertEntryPoint(hit.anyHit, hlsl::ShaderStage::ESS_ANY_HIT); + debloatTask.insertEntryPoint(hit.intersection, hlsl::ShaderStage::ESS_INTERSECTION); + } + for (const auto& callable : param.shaderGroups.callables) + debloatTask.insertEntryPoint(callable, hlsl::ShaderStage::ESS_CALLABLE); + + newParams[ix] = param; + newParams[ix].shaderGroups.raygen = debloatTask.debloat(param.shaderGroups.raygen, debloatedShaders); + + newParams[ix].shaderGroups.misses = { 
debloatedMissSpecData, param.shaderGroups.misses.size() }; + for (const auto& miss: param.shaderGroups.misses) + { + *debloatedMissSpecData = debloatTask.debloat(miss, debloatedShaders); + debloatedMissSpecData++; } - newParams[ix].shaders = std::span(outShaderSpecs, param.getShaders().size()); - debloatShaders(*m_spirvDebloater.get(), param.getShaders(), debloatedShaders, outShaderSpecs, m_logger); + newParams[ix].shaderGroups.hits = { debloatedHitSpecData, param.shaderGroups.hits.size() }; + for (const auto& hit: param.shaderGroups.hits) + { + *debloatedHitSpecData = { + .closestHit = debloatTask.debloat(hit.closestHit, debloatedShaders), + .intersection = debloatTask.debloat(hit.intersection, debloatedShaders), + .anyHit = debloatTask.debloat(hit.anyHit, debloatedShaders), + }; + debloatedHitSpecData++; + } + + newParams[ix].shaderGroups.callables = { debloatedCallableSpecData, param.shaderGroups.callables.size() }; + for (const auto& callable: param.shaderGroups.callables) + { + *debloatedCallableSpecData = debloatTask.debloat(callable, debloatedShaders); + debloatedCallableSpecData++; + } } createRayTracingPipelines_impl(pipelineCache, newParams,output,specConstantValidation); From f1fe0899869a762e377751ec9e85b68cde83e7f9 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 23 May 2025 20:57:12 +0700 Subject: [PATCH 189/346] Remove unused funciton in ILogicalDevice.cpp --- src/nbl/video/ILogicalDevice.cpp | 52 -------------------------------- 1 file changed, 52 deletions(-) diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index d43ef7c58c..7714219836 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -54,58 +54,6 @@ class SpirvDebloatTask const system::logger_opt_ptr m_logger; }; -using DebloaterEntryPoints = core::set; -static void insertEntryPoint(const IGPUPipelineBase::SShaderSpecInfo& shaderSpec, hlsl::ShaderStage stage, - core::map entryPointsMap) -{ - const auto* shader = 
shaderSpec.shader; - auto it = entryPointsMap.find(shader); - if (it == entryPointsMap.end() || it->first != shader) - it = entryPointsMap.emplace_hint(it, shader, DebloaterEntryPoints()); - it->second.insert({ .name = shaderSpec.entryPoint, .stage = stage }); -}; - -static void debloatShaders(const asset::ISPIRVDebloater& debloater, std::span shaderSpecs, core::vector>& outShaders, IGPUPipelineBase::SShaderSpecInfo* outShaderSpecInfos, system::logger_opt_ptr logger = nullptr) -{ - using EntryPoints = core::set; - core::map entryPointsMap; - - - // collect all entry points first before we debloat - for (const auto& shaderSpec : shaderSpecs) { - const auto* shader = shaderSpec.shader; - auto it = entryPointsMap.find(shader); - if (it == entryPointsMap.end() || it->first != shader) - it = entryPointsMap.emplace_hint(it, shader, EntryPoints()); - it->second.insert({ .name = shaderSpec.entryPoint, .stage = shaderSpec.stage }); - } - - core::map debloatedShaders; - for (const auto& shaderSpec: shaderSpecs) - { - const auto* shader = shaderSpec.shader; - const auto& entryPoints = entryPointsMap[shader]; - - auto debloatedShaderSpec = shaderSpec; - if (shader != nullptr) - { - if (!debloatedShaders.contains(shader)) - { - const auto outShadersData = outShaders.data(); - outShaders.push_back(debloater.debloat(shader, entryPoints, logger)); - assert(outShadersData == outShaders.data()); - debloatedShaders.emplace(shader, outShaders.back().get()); - } - const auto debloatedShader = debloatedShaders[shader]; - debloatedShaderSpec.shader = debloatedShader; - } - *outShaderSpecInfos = debloatedShaderSpec; - - outShaderSpecInfos++; - } - -} - ILogicalDevice::ILogicalDevice(core::smart_refctd_ptr&& api, const IPhysicalDevice* const physicalDevice, const SCreationParams& params, const bool runningInRenderdoc) : m_api(api), m_physicalDevice(physicalDevice), m_enabledFeatures(params.featuresToEnable), m_compilerSet(params.compilerSet), m_logger(m_physicalDevice->getDebugCallback() ? 
m_physicalDevice->getDebugCallback()->getLogger() : nullptr), From 89f499dde20b7e1ccaad32c0d2dbb3ba637433bc Mon Sep 17 00:00:00 2001 From: devsh Date: Sun, 25 May 2025 19:56:28 +0200 Subject: [PATCH 190/346] get the explicitly instantiated templated methods --- src/nbl/video/IGPUCommandBuffer.cpp | 32 ++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/nbl/video/IGPUCommandBuffer.cpp b/src/nbl/video/IGPUCommandBuffer.cpp index 40c5ea1e3b..bba06c424a 100644 --- a/src/nbl/video/IGPUCommandBuffer.cpp +++ b/src/nbl/video/IGPUCommandBuffer.cpp @@ -235,8 +235,8 @@ bool IGPUCommandBuffer::invalidDependency(const SDependencyInfo #endif // _NBL_DEBUG return false; } -template bool IGPUCommandBuffer::invalidDependency(const SDependencyInfo&) const; -template bool IGPUCommandBuffer::invalidDependency(const SDependencyInfo&) const; +template NBL_API2 bool IGPUCommandBuffer::invalidDependency(const SDependencyInfo&) const; +template NBL_API2 bool IGPUCommandBuffer::invalidDependency(const SDependencyInfo&) const; bool IGPUCommandBuffer::setEvent(IEvent* _event, const SEventDependencyInfo& depInfo) { @@ -848,16 +848,16 @@ uint32_t IGPUCommandBuffer::buildAccelerationStructures_common(const std::span( +template NBL_API2 uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( const std::span, IGPUBottomLevelAccelerationStructure::DirectBuildRangeRangeInfos, const IGPUBuffer* const ); -template uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( +template NBL_API2 uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( const std::span, IGPUBottomLevelAccelerationStructure::MaxInputCounts* const, const IGPUBuffer* const ); -template uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( +template NBL_API2 uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( const std::span, IGPUTopLevelAccelerationStructure::DirectBuildRangeRangeInfos, const IGPUBuffer* const ); -template uint32_t 
IGPUCommandBuffer::buildAccelerationStructures_common( +template NBL_API2 uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( const std::span, IGPUTopLevelAccelerationStructure::MaxInputCounts* const, const IGPUBuffer* const ); @@ -890,8 +890,8 @@ bool IGPUCommandBuffer::copyAccelerationStructure(const AccelerationStructure::C m_TLASTrackingOps.emplace_back(TLASTrackingCopy{.src=copyInfo.src,.dst=copyInfo.dst}); return retval; } -template bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUBottomLevelAccelerationStructure::CopyInfo&); -template bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUTopLevelAccelerationStructure::CopyInfo&); +template NBL_API2 bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUBottomLevelAccelerationStructure::CopyInfo&); +template NBL_API2 bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUTopLevelAccelerationStructure::CopyInfo&); template requires std::is_base_of_v bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const AccelerationStructure::DeviceCopyToMemoryInfo& copyInfo) @@ -919,8 +919,8 @@ bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const AccelerationStru m_TLASTrackingOps.emplace_back(TLASTrackingRead{.src=copyInfo.src,.dst=copyInfo.trackedBLASes}); return retval; } -template bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUBottomLevelAccelerationStructure::DeviceCopyToMemoryInfo&); -template bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUTopLevelAccelerationStructure::DeviceCopyToMemoryInfo&); +template NBL_API2 bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUBottomLevelAccelerationStructure::DeviceCopyToMemoryInfo&); +template NBL_API2 bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUTopLevelAccelerationStructure::DeviceCopyToMemoryInfo&); template requires std::is_base_of_v bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const 
AccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo) @@ -959,8 +959,8 @@ bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const AccelerationSt } return retval; } -template bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUBottomLevelAccelerationStructure::DeviceCopyFromMemoryInfo&); -template bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUTopLevelAccelerationStructure::DeviceCopyFromMemoryInfo&); +template NBL_API2 bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUBottomLevelAccelerationStructure::DeviceCopyFromMemoryInfo&); +template NBL_API2 bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUTopLevelAccelerationStructure::DeviceCopyFromMemoryInfo&); bool IGPUCommandBuffer::bindComputePipeline(const IGPUComputePipeline* const pipeline) @@ -1686,8 +1686,8 @@ bool IGPUCommandBuffer::invalidDrawIndirect(const asset::SBufferBinding(const asset::SBufferBinding&, const uint32_t, uint32_t); -template bool IGPUCommandBuffer::invalidDrawIndirect(const asset::SBufferBinding&, const uint32_t, uint32_t); +template NBL_API2 bool IGPUCommandBuffer::invalidDrawIndirect(const asset::SBufferBinding&, const uint32_t, uint32_t); +template NBL_API2 bool IGPUCommandBuffer::invalidDrawIndirect(const asset::SBufferBinding&, const uint32_t, uint32_t); template requires nbl::is_any_of_v bool IGPUCommandBuffer::invalidDrawIndirectCount(const asset::SBufferBinding& indirectBinding, const asset::SBufferBinding& countBinding, const uint32_t maxDrawCount, const uint32_t stride) @@ -1705,8 +1705,8 @@ bool IGPUCommandBuffer::invalidDrawIndirectCount(const asset::SBufferBinding(const asset::SBufferBinding&, const asset::SBufferBinding&, const uint32_t, const uint32_t); -template bool IGPUCommandBuffer::invalidDrawIndirectCount(const asset::SBufferBinding&, const asset::SBufferBinding&, const uint32_t, const uint32_t); +template NBL_API2 bool IGPUCommandBuffer::invalidDrawIndirectCount(const 
asset::SBufferBinding&, const asset::SBufferBinding&, const uint32_t, const uint32_t); +template NBL_API2 bool IGPUCommandBuffer::invalidDrawIndirectCount(const asset::SBufferBinding&, const asset::SBufferBinding&, const uint32_t, const uint32_t); bool IGPUCommandBuffer::drawIndirect(const asset::SBufferBinding& binding, const uint32_t drawCount, const uint32_t stride) { From 499c10dbaf33ef3c9509b153404df937e1a67dee Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 26 May 2025 10:08:01 +0200 Subject: [PATCH 191/346] make asset converter work properly in absence of transfer SIntendedSubmitInfo but when compute calls are done/needed --- src/nbl/video/utilities/CAssetConverter.cpp | 44 ++++++++++++--------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 548c049bfe..4aa631c746 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -4037,7 +4037,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // whenever transfer needs to do a submit overflow because it ran out of memory for streaming, we can already submit the recorded compute shader dispatches auto computeCmdBuf = shouldDoSomeCompute ? 
params.compute->getCommandBufferForRecording():nullptr; - auto drainCompute = [¶ms,&computeCmdBuf](const std::span extraSignal={})->auto + auto drainCompute = [¶ms,shouldDoSomeTransfer,&computeCmdBuf](const std::span extraSignal={})->auto { if (!computeCmdBuf || computeCmdBuf->cmdbuf->empty()) return IQueue::RESULT::SUCCESS; @@ -4045,15 +4045,18 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul auto& waitSemaphoreSpan = params.compute->waitSemaphores; std::unique_ptr patchedWaits; // the transfer scratch semaphore value, is from the last submit, not the future value we're enqueing all the deferred memory releases with - if (waitSemaphoreSpan.empty()) - waitSemaphoreSpan = {¶ms.transfer->scratchSemaphore,1}; - else + if (shouldDoSomeTransfer) { - const auto origCount = waitSemaphoreSpan.size(); - patchedWaits.reset(new IQueue::SSubmitInfo::SSemaphoreInfo[origCount+1]); - std::copy(waitSemaphoreSpan.begin(),waitSemaphoreSpan.end(),patchedWaits.get()); - patchedWaits[origCount] = params.transfer->scratchSemaphore; - waitSemaphoreSpan = {patchedWaits.get(),origCount+1}; + if (waitSemaphoreSpan.empty()) + waitSemaphoreSpan = {¶ms.transfer->scratchSemaphore,1}; + else + { + const auto origCount = waitSemaphoreSpan.size(); + patchedWaits.reset(new IQueue::SSubmitInfo::SSemaphoreInfo[origCount+1]); + std::copy(waitSemaphoreSpan.begin(),waitSemaphoreSpan.end(),patchedWaits.get()); + patchedWaits[origCount] = params.transfer->scratchSemaphore; + waitSemaphoreSpan = {patchedWaits.get(),origCount+1}; + } } // don't worry about resetting old `waitSemaphores` because they get cleared to an empty span after overflow submit IQueue::RESULT res = params.compute->submit(computeCmdBuf,extraSignal); @@ -4067,14 +4070,18 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul }; // We want to be doing Host operations while stalled for GPU, compose our overflow callback on top of what's already there, only if we need to ofc - auto origXferStallCallback = 
params.transfer->overflowCallback; - params.transfer->overflowCallback = [device,&hostUploadBuffers,&origXferStallCallback,&drainCompute](const ISemaphore::SWaitInfo& tillScratchResettable)->void + std::function origXferStallCallback; + if (shouldDoSomeTransfer) { - drainCompute(); - if (origXferStallCallback) - origXferStallCallback(tillScratchResettable); - hostUploadBuffers([device,&tillScratchResettable]()->bool{return device->waitForSemaphores({&tillScratchResettable,1},false,0)==ISemaphore::WAIT_RESULT::TIMEOUT;}); - }; + origXferStallCallback = std::move(params.transfer->overflowCallback); + params.transfer->overflowCallback = [device,&hostUploadBuffers,&origXferStallCallback,&drainCompute](const ISemaphore::SWaitInfo& tillScratchResettable)->void + { + drainCompute(); + if (origXferStallCallback) + origXferStallCallback(tillScratchResettable); + hostUploadBuffers([device,&tillScratchResettable]()->bool{return device->waitForSemaphores({&tillScratchResettable,1},false,0)==ISemaphore::WAIT_RESULT::TIMEOUT;}); + }; + } // when overflowing compute resources, we need to submit the Xfer before submitting Compute auto drainBoth = [¶ms,&xferCmdBuf,&drainCompute](const std::span extraSignal={})->auto { @@ -4149,7 +4156,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul return true; }; - // because of the layout transitions + // because of the layout transitions (TODO: conditional when host_image_copy gets implemented) params.transfer->scratchSemaphore.stageMask |= PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS; // TODO:: Shall we rewrite? e.g. 
we upload everything first, extra submit for QFOT pipeline barrier & transition in overflow callback, then record compute commands, and submit them, plus their final QFOTs // Lets analyze sync cases: @@ -5337,7 +5344,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul retval.set({params.transfer->scratchSemaphore.semaphore,params.transfer->scratchSemaphore.value}); } // reset original callback - params.transfer->overflowCallback = origXferStallCallback; + if (bool(origXferStallCallback)) + params.transfer->overflowCallback = std::move(origXferStallCallback); // Its too dangerous to leave an Intended Transfer Submit hanging around that needs to be submitted for Compute to make forward progress outside of this utility, // and doing transfer-signals-after-compute-wait timeline sema tricks are not and option because: From 31e4e084291b7922b660866d8fe1307b8ff07ffa Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 26 May 2025 10:08:23 +0200 Subject: [PATCH 192/346] update examples_tests --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 69ba991ea4..e30938c261 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 69ba991ea4827c80d008a31256785f4c4c60f12d +Subproject commit e30938c2615dd5d3ab69cadca3ba11d1e03f8233 From 0e9d9323ccab52eebb70ddbc02e1ef03ab7bf76f Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 26 May 2025 10:24:30 +0200 Subject: [PATCH 193/346] save work --- .github/workflows/build-nabla.yml | 4 +- docker/compiler-explorer | 2 +- docker/msvc-winsdk | 2 +- src/nbl/device/gen.py | 2 +- src/nbl/video/CJITIncludeLoader.cpp | 1 - tools/nsc/CMakeLists.txt | 551 +++++++--------------------- 6 files changed, 136 insertions(+), 426 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 967953aeef..79b5d7aabb 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ 
-82,8 +82,8 @@ jobs: - name: Package workflow artifacts run: | - tar -cvf "${{ steps.set-prefix.outputs.prefix }}-profiling.tar" -C profiling . - tar -cvf "${{ steps.set-prefix.outputs.prefix }}-install.tar" -C ${{ env.install }} . + tar -cvf "${{ steps.set-prefix.outputs.prefix }}-profiling.tar" profiling + tar -cvf "${{ steps.set-prefix.outputs.prefix }}-install.tar" ${{ env.install }} - name: Upload profiling artifacts uses: actions/upload-artifact@v4 diff --git a/docker/compiler-explorer b/docker/compiler-explorer index e7d3e6ce85..45866dfa87 160000 --- a/docker/compiler-explorer +++ b/docker/compiler-explorer @@ -1 +1 @@ -Subproject commit e7d3e6ce85d4b87bd9afadc5b2ba8c268ccbeb51 +Subproject commit 45866dfa8782404fc121f25ce15ad0626b474db0 diff --git a/docker/msvc-winsdk b/docker/msvc-winsdk index 831515f599..d91a96faed 160000 --- a/docker/msvc-winsdk +++ b/docker/msvc-winsdk @@ -1 +1 @@ -Subproject commit 831515f59919fbe97653804a5fc634aeb36d360e +Subproject commit d91a96faede2933ec02a18b94141fbed549929c0 diff --git a/src/nbl/device/gen.py b/src/nbl/device/gen.py index 253d529b3d..88174cb3c2 100644 --- a/src/nbl/device/gen.py +++ b/src/nbl/device/gen.py @@ -120,7 +120,7 @@ args.jit_traits_output_path, buildTraitsHeader, type="JIT Members", - template="oss << \"NBL_CONSTEXPR_STATIC_INLINE {} {} = ({})\" + CJITIncludeLoader::to_string({}.{});", + template="oss << \"NBL_CONSTEXPR_STATIC_INLINE {} {} = ({})\" + CJITIncludeLoader::to_string({}.{}) << \";\\n\";", limits_json=limits, features_json=features, format_params=["type", "name", "type", "json_type", "cpp_name"], diff --git a/src/nbl/video/CJITIncludeLoader.cpp b/src/nbl/video/CJITIncludeLoader.cpp index a9f27e5afd..1fcbcb0505 100644 --- a/src/nbl/video/CJITIncludeLoader.cpp +++ b/src/nbl/video/CJITIncludeLoader.cpp @@ -20,7 +20,6 @@ auto CJITIncludeLoader::getInclude(const system::path& searchPath, const std::st std::string CJITIncludeLoader::collectDeviceCaps(const SPhysicalDeviceLimits& limits, const 
SPhysicalDeviceFeatures& features) { #include "nbl/video/device_capabilities_traits_jit.h" - std::string start = R"===( #ifndef _NBL_BUILTIN_HLSL_JIT_DEVICE_CAPABILITIES_INCLUDED_ #define _NBL_BUILTIN_HLSL_JIT_DEVICE_CAPABILITIES_INCLUDED_ diff --git a/tools/nsc/CMakeLists.txt b/tools/nsc/CMakeLists.txt index bb45442982..efe7741f4e 100644 --- a/tools/nsc/CMakeLists.txt +++ b/tools/nsc/CMakeLists.txt @@ -6,6 +6,7 @@ set(GODBOLT_BINARY_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/compiler-explorer") set(GODBOLT_BINARY_PRETEST_DIRECTORY "${GODBOLT_BINARY_DIRECTORY}/pre-test") set(NBL_NSC_COMPILE_DIRECTORY "${GODBOLT_BINARY_PRETEST_DIRECTORY}/.compile/$") set(NBL_NSC_PREINSTALL_DIRECTORY "${GODBOLT_BINARY_PRETEST_DIRECTORY}/.preinstall") +make_directory("${NBL_NSC_PREINSTALL_DIRECTORY}") set(NBL_DOCKER_CT_NSC_VOLUME_SOURCE "${GODBOLT_BINARY_DIRECTORY}/install") @@ -56,323 +57,138 @@ add_test(NAME NBL_NSC_DUMP_BUILD_INFO_TEST if(NBL_ENABLE_DOCKER_INTEGRATION) -find_program(DOCKER_EXE - NAMES docker - REQUIRED -) +find_program(CTEST_EXE NAMES ctest REQUIRED) +find_program(DOCKER_EXE NAMES docker REQUIRED) + +find_file(DXIL_DLL NAMES dxil.dll HINTS "$ENV{CMAKE_WINDOWS_KITS_10_DIR}/Redist/D3D/x64" "C:/Program Files (x86)/Windows Kits/10/Redist/D3D/x64" REQUIRED) +cmake_path(GET DXIL_DLL PARENT_PATH DXIL_DIR) +cmake_path(NATIVE_PATH DXIL_DIR NORMALIZE DXIL_DIR) + +find_file(ICU_DLL NAMES icu.dll HINTS REQUIRED) +cmake_path(GET ICU_DLL PARENT_PATH ICU_DIR) +cmake_path(NATIVE_PATH ICU_DIR NORMALIZE ICU_DIR) +set(ICU_GLOBALIZATION_DIR "C:/Windows/Globalization/ICU") +find_file(ICUDTL_DAT NAMES icudtl.dat HINTS "${ICU_GLOBALIZATION_DIR}" REQUIRED) + +find_file(UCRTBASED_DLL NAMES ucrtbased.dll HINTS ${UCRTBASED_DLL_DIR} REQUIRED) +cmake_path(GET UCRTBASED_DLL PARENT_PATH UCRTBASED_DIR) +cmake_path(NATIVE_PATH UCRTBASED_DIR NORMALIZE UCRTBASED_DIR) + +find_program(SPIRV_DIS_EXE NAMES spirv-dis HINTS "${VULKAN_SDK}/Bin" REQUIRED) +cmake_path(GET SPIRV_DIS_EXE PARENT_PATH SPIRV_DIS_DIR) 
+cmake_path(NATIVE_PATH SPIRV_DIS_DIR NORMALIZE SPIRV_DIS_DIR) + +cmake_path(NATIVE_PATH MSVC_REDIST_DIR NORMALIZE TOOLSET_REDIST_PATH) -find_program(SPIRV_DIS_EXE - NAMES spirv-dis - HINTS "$ENV{VULKAN_SDK_INSTALL_DIRECTORY}/Bin" - HINTS "$ENV{VK_SDK_PATH}/Bin" - HINTS "$ENV{VULKAN_SDK}/Bin" - REQUIRED +file(GLOB_RECURSE VC_MODULES LIST_DIRECTORIES false + "${TOOLSET_REDIST_PATH}/x64/*.CRT/*.dll" + "${TOOLSET_REDIST_PATH}/debug_nonredist/x64/*.DebugCRT/*.dll" ) -cmake_path(GET Vulkan_INCLUDE_DIR PARENT_PATH VULKAN_SDK_INSTALL_DIRECTORY) -get_filename_component(VULKAN_SDK_VERSION "${VULKAN_SDK_INSTALL_DIRECTORY}" NAME) +foreach(MODULE ${VC_MODULES}) + get_filename_component(DIR ${MODULE} DIRECTORY) + cmake_path(NATIVE_PATH DIR NORMALIZE DIR) + list(APPEND VC_MODULE_DIRS ${DIR}) +endforeach() -if(NOT EXISTS "${VULKAN_SDK_INSTALL_DIRECTORY}") - message(FATAL_ERROR "Internal error, VULKAN_SDK_INSTALL_DIRECTORY doesn't exist") +if(NOT VC_MODULE_DIRS) + message(FATAL_ERROR "Failed to GLOB for VC Redist modules!") endif() -find_program(CTEST_EXE - NAMES ctest - REQUIRED -) +set(CT_RUNTIMES C:/pack/runtimes) +cmake_path(NATIVE_PATH CT_RUNTIMES NORMALIZE CT_RUNTIMES) +set(HOST_MOUNT_DIRS ${VC_MODULE_DIRS} ${SPIRV_DIS_DIR} ${UCRTBASED_DIR} ${DXIL_DIR} ${ICU_DIR}) +list(REMOVE_DUPLICATES HOST_MOUNT_DIRS) -set(NBL_DOCKER_NSC_COMPILER_CONFIG_OUTPUT "${NBL_DOCKER_CT_NSC_VOLUME_SOURCE}/hlsl.local.properties.cmake") +set(ix 0) +foreach(DIR ${HOST_MOUNT_DIRS}) + set(TARGET_MOUNT_DIR "${CT_RUNTIMES}/system/${ix}") + cmake_path(NATIVE_PATH TARGET_MOUNT_DIR NORMALIZE TARGET_MOUNT_DIR) -set(NBL_DOCKER_CT_NSC_VOLUME_TARGET "C:\\\\nsc\\\\install") -string(GENEX_STRIP "${NBL_PACKAGE_RUNTIME_EXE_DIR_PATH}" NBL_RELATIVE_ENTRY) -set(NSC_RELEASE_BUILD_INFO "${NBL_NSC_PREINSTALL_DIRECTORY}/${NBL_RELATIVE_ENTRY}/${NBL_NSC_BUILD_INFO_FILENAME}") -set(NSC_RELWITHDEBINFO_BUILD_INFO "${NBL_NSC_PREINSTALL_DIRECTORY}/relwithdebinfo/${NBL_RELATIVE_ENTRY}/${NBL_NSC_BUILD_INFO_FILENAME}") 
-set(NSC_DEBUG_BUILD_INFO "${NBL_NSC_PREINSTALL_DIRECTORY}/debug/${NBL_RELATIVE_ENTRY}/${NBL_NSC_BUILD_INFO_FILENAME}") -cmake_path(NATIVE_PATH NSC_RELEASE_BUILD_INFO NORMALIZE NSC_RELEASE_BUILD_INFO) -cmake_path(NATIVE_PATH NSC_RELWITHDEBINFO_BUILD_INFO NORMALIZE NSC_RELWITHDEBINFO_BUILD_INFO) -cmake_path(NATIVE_PATH NSC_DEBUG_BUILD_INFO NORMALIZE NSC_DEBUG_BUILD_INFO) - -set(NBL_INSTALL_DIRECTORY "${NBL_DOCKER_CT_NSC_VOLUME_TARGET}") -cmake_path(NATIVE_PATH NBL_DOCKER_CT_NSC_VOLUME_TARGET NORMALIZE NBL_DOCKER_CT_NSC_VOLUME_TARGET) + list(APPEND DOCKER_CLI_ARGS -v "${DIR}:${TARGET_MOUNT_DIR}:ro") + list(APPEND CT_MOUNT_DIRS "${TARGET_MOUNT_DIR}") + + math(EXPR ix "${ix} + 1" OUTPUT_FORMAT DECIMAL) +endforeach() +set(NBL_DOCKER_CT_NSC_VOLUME_TARGET "${CT_RUNTIMES}/Nabla") set(NBL_BUILD_INFO_POSTPROCESS_COMMAND "${CMAKE_COMMAND}" "-DNBL_EXECUTABLE_PATH=${NBL_NSC_PREINSTALL_TARGET_EXE_FILEPATH}" "-DNBL_BUILD_INFO=${NBL_NSC_PREINSTALL_TARGET_BUILD_INFO}" "-DNBL_OUTPUT_FILE=${NBL_NSC_PREINSTALL_TARGET_BUILD_INFO}" - "-DNBL_OUTPUT_EXE_OVERRIDE=$" # as in CT, it's *not* host exe location! 
+ "-DNBL_OUTPUT_EXE_OVERRIDE=$" -P "${NBL_ROOT_PATH}/cmake/scripts/nbl/nablaBuildInfo.cmake" ) +cmake_path(NATIVE_PATH NBL_DOCKER_CT_NSC_VOLUME_SOURCE NORMALIZE NBL_DOCKER_CT_NSC_VOLUME_SOURCE) +cmake_path(NATIVE_PATH NBL_DOCKER_CT_NSC_VOLUME_TARGET NORMALIZE NBL_DOCKER_CT_NSC_VOLUME_TARGET) +cmake_path(NATIVE_PATH NBL_NSC_PREINSTALL_DIRECTORY NORMALIZE NBL_NSC_PREINSTALL_DIRECTORY) +list(APPEND DOCKER_CLI_ARGS -v "${NBL_NSC_PREINSTALL_DIRECTORY}:${NBL_DOCKER_CT_NSC_VOLUME_TARGET}") +#list(APPEND DOCKER_CLI_ARGS -v "${ICU_GLOBALIZATION_DIR}:${ICU_GLOBALIZATION_DIR}:ro") -cmake_path(GET SPIRV_DIS_EXE PARENT_PATH VULKAN_SDK_BIN_DIRECTORY) -cmake_path(NATIVE_PATH VULKAN_SDK_BIN_DIRECTORY NORMALIZE VULKAN_SDK_BIN_DIRECTORY) -cmake_path(GET SPIRV_DIS_EXE FILENAME SPIRV_DIS_EXE) -set(CT_SPIRV_DIS_EXE "C:\\vulkan\\${VULKAN_SDK_VERSION}\\bin\\${SPIRV_DIS_EXE}") -cmake_path(NATIVE_PATH CT_SPIRV_DIS_EXE NORMALIZE CT_SPIRV_DIS_EXE) - +set(NBL_DOCKER_NSC_COMPILER_CONFIG_OUTPUT "${NBL_DOCKER_CT_NSC_VOLUME_SOURCE}/hlsl.local.properties.cmake") +string(GENEX_STRIP "${NBL_PACKAGE_RUNTIME_EXE_DIR_PATH}" NBL_RELATIVE_ENTRY) +set(OUTPUT_CONFIG_FILE $) set(NBL_CE_GENERATE_CONFIG_COMMAND "${CMAKE_COMMAND}" - "-DSPIRV_DIS_EXE=${CT_SPIRV_DIS_EXE}" - "-DNSC_RELEASE_BUILD_INFO=${NSC_RELEASE_BUILD_INFO}" - "-DNSC_RELWITHDEBINFO_BUILD_INFO=${NSC_RELWITHDEBINFO_BUILD_INFO}" - "-DNSC_DEBUG_BUILD_INFO=${NSC_DEBUG_BUILD_INFO}" - "-DOUTPUT_CONFIG_FILE=${NBL_DOCKER_NSC_COMPILER_CONFIG_OUTPUT}" + "-DSPIRV_DIS_EXE=spirv-dis.exe" + "-DNSC_RELEASE_BUILD_INFO=$" + "-DNSC_RELWITHDEBINFO_BUILD_INFO=$" + "-DNSC_DEBUG_BUILD_INFO=$" + "-DOUTPUT_CONFIG_FILE=${OUTPUT_CONFIG_FILE}" -P "${CMAKE_CURRENT_SOURCE_DIR}/ce-generate-config.cmake" ) -set(NBL_DOCKER_CE_DOCKER_CTX "${NBL_ROOT_PATH}/docker/compiler-explorer") -set(NBL_DOCKER_CE_DOCKERFILE_BASE "${NBL_DOCKER_CE_DOCKER_CTX}/Dockerfile") -set(NBL_DOCKER_CE_COMPOSE_BASE "${NBL_DOCKER_CE_DOCKER_CTX}/compose.yml") -cmake_path(NATIVE_PATH 
NBL_DOCKER_CE_COMPOSE_BASE NORMALIZE NBL_DOCKER_CE_COMPOSE_BASE) -set(NBL_DOCKER_CE_COMPOSE_TARGET "${GODBOLT_BINARY_DIRECTORY}/.dev-compose.yml") - -include(InstallRequiredSystemLibraries) - -string(REPLACE "v" "VC" TARGET_DCRT ${CMAKE_VS_PLATFORM_TOOLSET}) -set(DEBUG_CRT_RELATIVE debug_nonredist/x64/Microsoft.${TARGET_DCRT}.DebugCRT) -set(DEBUG_CRT_DIRECTORY_SOURCE "${MSVC_REDIST_DIR}/${DEBUG_CRT_RELATIVE}") -cmake_path(NATIVE_PATH MSVC_REDIST_DIR NORMALIZE NBL_REDIST_DIR) - -if(NOT EXISTS "${DEBUG_CRT_DIRECTORY_SOURCE}") - message(FATAL_ERROR "DEBUG_CRT_DIRECTORY_SOURCE = \"${DEBUG_CRT_DIRECTORY_SOURCE}\" doesn't exist!") -endif() - -set(DEBUG_CRT_DIRECTORY_TARGET "${NBL_DOCKER_CT_NSC_VOLUME_SOURCE}/.nonredist") -file(MAKE_DIRECTORY "${DEBUG_CRT_DIRECTORY_TARGET}") -file(GLOB CRT_FILES "${DEBUG_CRT_DIRECTORY_SOURCE}/*") - -find_file(UCRTBASED_DLL_PATH - NAMES ucrtbased.dll - REQUIRED -) - -# TODO: (***) ---> THIS GOES TO /docker to CMakeLists.txt file! +set(CT_ENV_FILE "${CMAKE_CURRENT_BINARY_DIR}/.env") +string(CONFIGURE [=[ +CT_MOUNT_DIRS=@CT_MOUNT_DIRS@ +NBL_INSTALL_DIRECTORY=@NBL_DOCKER_CT_NSC_VOLUME_TARGET@ +NBL_EXPLICIT_MODULE_LOAD_LOG=ON +]=] ENV_CONTENT @ONLY) +file(WRITE "${CT_ENV_FILE}" "${ENV_CONTENT}") +list(APPEND DOCKER_CLI_ARGS --env-file "${CT_ENV_FILE}") -set(BASE_IMAGE mcr.microsoft.com/windows/servercore:ltsc2022-amd64) # NOTE: HARDCODED CURRENTLY - -string(TOLOWER "dr.devsh.eu/nabla/cmake-host-dev-env/${CMAKE_SYSTEM_NAME}/package/vulkan:latest" DOCKER_VULKAN_TAG) -string(TOLOWER "dr.devsh.eu/nabla/cmake-host-dev-env/${CMAKE_SYSTEM_NAME}/toolset/redist/${CMAKE_CXX_COMPILER_ID}/crt:latest" DOCKER_CRT_TAG) -string(TOLOWER "dr.devsh.eu/nabla/cmake-host-dev-env/${CMAKE_SYSTEM_NAME}/build/${CMAKE_CXX_COMPILER_ID}/devel-compiler-explorer-nsc:latest" DOCKER_DEVEL_TAG) - -cmake_path(NATIVE_PATH MSVC_REDIST_DIR NORMALIZE TOOLSET_REDIST_PATH) -get_filename_component(REDIST_CRT_TOOLSET_VERSION "${TOOLSET_REDIST_PATH}" NAME) - -function(GEN_DOCKER_CONTENT 
_CTX_ _OUTPUT_DIRECTORY_ _EXTRA_DOCKERFILE_CONTENT_ _DOCKER_IGNORE_CONTENT_ _S_NAME_ _CT_NAME_ _IMAGE_NAME_ _WITH_BUILD_) - -set(_OUTPUT_D_PATH_ "${_OUTPUT_DIRECTORY_}/Dockerfile") -set(_OUTPUT_C_PATH_ "${_OUTPUT_DIRECTORY_}/compose.yml") - -string(CONFIGURE "${_EXTRA_DOCKERFILE_CONTENT_}" _EXTRA_DOCKERFILE_CONTENT_EVAL_ @ONLY) -string(CONFIGURE "${_DOCKER_IGNORE_CONTENT_}" _DOCKER_IGNORE_CONTENT_EVAL_ @ONLY) - -unset(DOCKER_CONTENT) -string(APPEND DOCKER_CONTENT -[=[ -# escape=` - -ARG BASE_IMAGE=@BASE_IMAGE@ -FROM ${BASE_IMAGE} -SHELL ["cmd", "/S", "/C"] -@_EXTRA_DOCKERFILE_CONTENT_EVAL_@ -]=] -) - -string(CONFIGURE "${DOCKER_CONTENT}" DOCKER_CONTENT @ONLY) -file(WRITE "${_OUTPUT_D_PATH_}" "${DOCKER_CONTENT}") - -set(_CTX_TARGET_ "${_OUTPUT_DIRECTORY_}/.ctx") - -if("${_CTX_}" STREQUAL "") - -else() - if(NOT EXISTS "${_CTX_}") - message(FATAL_ERROR "Invalid source context directory doesn't exist! _CTX_: \"${_CTX_}\"") - endif() - - file(COPY "${_CTX_}" DESTINATION "${_CTX_TARGET_}") -endif() - -set(_OUTPUT_I_PATH_ "${_CTX_TARGET_}/.dockerignore") - -unset(COMPOSE_CONTENT) -string(APPEND COMPOSE_CONTENT -[=[ -services: - @_S_NAME_@: - build: - context: ./.ctx - dockerfile: "@_OUTPUT_D_PATH_@" - image: @_IMAGE_NAME_@ - container_name: @_CT_NAME_@ - networks: - docker_default: - -networks: - docker_default: - external: true -]=] -) - -string(CONFIGURE "${COMPOSE_CONTENT}" COMPOSE_CONTENT @ONLY) -file(WRITE "${_OUTPUT_C_PATH_}" "${COMPOSE_CONTENT}") -file(WRITE "${_OUTPUT_I_PATH_}" "${_DOCKER_IGNORE_CONTENT_EVAL_}") - -if(_WITH_BUILD_) - execute_process(COMMAND "${DOCKER_EXE}" compose -f "${_OUTPUT_C_PATH_}" build) -endif() -endfunction() - -# Vulkan -set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/docker/vulkan") -set(CT_VULKAN_TARGET vulkan) -GEN_DOCKER_CONTENT("${VULKAN_SDK_INSTALL_DIRECTORY}" "${OUTPUT_DIRECTORY}" -[=[ -COPY ./ "@CT_VULKAN_TARGET@" - -ENV VULKAN_SDK="C:/@CT_VULKAN_TARGET@" -ENV VULKAN_SDK_VERSION="@VULKAN_SDK_VERSION@" -LABEL 
VULKAN_SDK="C:/@CT_VULKAN_TARGET@" -LABEL VULKAN_SDK_VERSION="@VULKAN_SDK_VERSION@" -]=] -[=[ -* -!@VULKAN_SDK_VERSION@/Bin/*.dll -!@VULKAN_SDK_VERSION@/Bin/*spirv*.exe -]=] -nabla-dev-env-vulkan -nabla.dev.env.vulkan -${DOCKER_VULKAN_TAG} -ON -) - -# CRT -set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/docker/crt") -set(CT_TOOLSET_REDIST_TARGET toolset_redist) -make_directory("${OUTPUT_DIRECTORY}/.ctx") -file(COPY "${UCRTBASED_DLL_PATH}" DESTINATION "${OUTPUT_DIRECTORY}/.ctx") -GEN_DOCKER_CONTENT("${TOOLSET_REDIST_PATH}" "${OUTPUT_DIRECTORY}" -[=[ -COPY ./ "/@CT_TOOLSET_REDIST_TARGET@" - -ENV REDIST_CRT_TOOLSET_VERSION="@REDIST_CRT_TOOLSET_VERSION@" -ENV TOOLSET_REDIST_PATH="C:/@CT_TOOLSET_REDIST_TARGET@" -LABEL REDIST_CRT_TOOLSET_VERSION="@REDIST_CRT_TOOLSET_VERSION@" -LABEL TOOLSET_REDIST_PATH="C:/@CT_TOOLSET_REDIST_TARGET@" -]=] -[=[ -* -!ucrtbased.dll -!@REDIST_CRT_TOOLSET_VERSION@/vc_redist.x64.exe -!@REDIST_CRT_TOOLSET_VERSION@/@DEBUG_CRT_RELATIVE@/*.dll -]=] -nabla-dev-env-crt -nabla.dev.env.crt -${DOCKER_CRT_TAG} -ON -) - -# Devel, combined -set(BASE_IMAGE dr.devsh.eu/compiler-explorer/windows) - -# NOTE to self: could be all done with single docker file & compose file but buildkit works bad with windows driver, yet need to wait for stuff to be implemented -set(DEVEL_CTX "${CMAKE_CURRENT_BINARY_DIR}/docker/devel") -set(CT_REDIST_DIR "${CT_TOOLSET_REDIST_TARGET}/${REDIST_CRT_TOOLSET_VERSION}") -set(CT_NONREDIST_CTR_DIR "${CT_REDIST_DIR}/${DEBUG_CRT_RELATIVE}") -cmake_path(NATIVE_PATH CT_REDIST_DIR NORMALIZE CT_REDIST_DIR) -cmake_path(NATIVE_PATH CT_NONREDIST_CTR_DIR NORMALIZE CT_NONREDIST_CTR_DIR) -set(DEVEL_DOCKERFILE "${DEVEL_CTX}/Dockerfile") - -GEN_DOCKER_CONTENT("" "${DEVEL_CTX}" -[=[ - -COPY --link --from=@DOCKER_VULKAN_TAG@ /@CT_VULKAN_TARGET@ /@CT_VULKAN_TARGET@ -COPY --link --from=@DOCKER_CRT_TAG@ /@CT_TOOLSET_REDIST_TARGET@ /@CT_TOOLSET_REDIST_TARGET@ - -# TODO -# RUN .\@CT_REDIST_DIR@\vc_redist.x64.exe /quiet /install -RUN xcopy 
.\@CT_NONREDIST_CTR_DIR@\*.dll %SystemRoot%\System32 /Y -RUN xcopy .\@CT_TOOLSET_REDIST_TARGET@\ucrtbased.dll %SystemRoot%\System32 /Y - -]=] -[=[ - -]=] -nabla-dev-env-nsc -nabla.dev.env.nsc -${DOCKER_DEVEL_TAG} -OFF -) - -# <---(***) - -set(NABLA_DEV_ENV_CT_NAME dev.nabla.env.${CMAKE_SYSTEM_NAME}.${CMAKE_CXX_COMPILER_ID}.base) -string(TOLOWER "${NABLA_DEV_ENV_CT_NAME}" NABLA_DEV_ENV_CT_NAME) - -set(COMPOSE_NSC_DEV_SERVICE compiler-explorer-nsc-dev) -string(TOLOWER "dr.devsh.eu/nabla/cmake-host-dev-env/${CMAKE_SYSTEM_NAME}/build/${CMAKE_CXX_COMPILER_ID}/compiler-explorer-nsc:latest" COMPOSE_NSC_DEV_IMAGE) -string(TOLOWER "dr.devsh.eu/compiler-explorer/production/windows/nsc/orphan-production-test:latest" COMPOSE_NSC_ORPHAN_PRODUCTION_TEST_IMAGE) -string(TOLOWER "dr.devsh.eu/compiler-explorer/production/windows/nsc/orphan-prodution-cache:latest" COMPOSE_NSC_PRODUCTION_CACHE_IMAGE) -string(TOLOWER "dr.devsh.eu/compiler-explorer/production/windows/nsc:latest" COMPOSE_NSC_PRODUCTION_IMAGE) - -string(APPEND COMPOSE_CONTENT -[=[ -services: - @COMPOSE_NSC_DEV_SERVICE@: - container_name: dev.ce.nsc.dev - extends: - file: @NBL_DOCKER_CE_COMPOSE_BASE@ - service: compiler-explorer - build: - context: ./.ctx - dockerfile: @DEVEL_DOCKERFILE@ - image: @COMPOSE_NSC_DEV_IMAGE@ - environment: - NBL_INSTALL_DIRECTORY: "@NBL_INSTALL_DIRECTORY@" - NBL_EXPLICIT_MODULE_LOAD_LOG: "ON" - entrypoint: - - "cmd" - - "/c" - - > - copy C:\\nsc\\install\\hlsl.local.properties.cmake %GIT_GODBOLT_REPOSITORY_PATH%\\etc\\config\\hlsl.local.properties - && npm --prefix %GIT_GODBOLT_REPOSITORY_PATH% run dev -- --language hlsl - volumes: - - type: bind - source: .\install - target: @NBL_DOCKER_CT_NSC_VOLUME_TARGET@ - read_only: true - -networks: - docker_default: - external: true -]=] -) - -string(CONFIGURE "${COMPOSE_CONTENT}" COMPOSE_CONTENT @ONLY) -file(WRITE "${NBL_DOCKER_CE_COMPOSE_TARGET}" "${COMPOSE_CONTENT}") -make_directory("${GODBOLT_BINARY_DIRECTORY}/.ctx") - 
-function(_PROMOTE_PROCESS_ISOLATION_ KERNEL BASES VAR) +set(CT_SETUP_FILE "${CMAKE_CURRENT_BINARY_DIR}/setup.bat") +string(CONFIGURE [=[ +@echo off +set "PATH=%PATH%;%CT_MOUNT_DIRS%" +setx PATH "%PATH%" /M +node --no-warnings --no-deprecation --import=tsx ./app.js --language hlsl +]=] SETUP_CONTENT @ONLY) +file(WRITE "${CT_SETUP_FILE}" "${SETUP_CONTENT}") +list(APPEND DOCKER_CLI_ARGS) + +function(PROMOTE_PROCESS_ISOLATION HOST_KERNEL BASE VAR) set(${VAR} True) - set(ix 0) - list(LENGTH BASES LEN) - - while(ix LESS ${LEN}) - list(GET BASES ${ix} BASE) - - execute_process(COMMAND "${DOCKER_EXE}" inspect --format={{.OsVersion}} ${BASE} RESULT_VARIABLE EXIT_LEVEL OUTPUT_VARIABLE TARGET_KERNEL OUTPUT_STRIP_TRAILING_WHITESPACE) - - if(${EXIT_LEVEL} EQUAL 0) - if(${KERNEL} VERSION_LESS ${TARGET_KERNEL}) - set(${VAR} False PARENT_SCOPE) - message(STATUS "While inspecting ${BASE} - host Kernel ${KERNEL} too low to use container process isolation (target ${TARGET_KERNEL}), falling back to HyperV. Please update your host OS.") - return() - endif() - math(EXPR ix "${ix} + 1") - else() - message(STATUS "Docker image ${BASE} not found locally, pulling...") - execute_process(COMMAND "${DOCKER_EXE}" pull ${BASE}) - endif() - endwhile() + + macro(INSPECT IMAGE) + execute_process(COMMAND "${DOCKER_EXE}" inspect --format={{.OsVersion}} ${IMAGE} + RESULT_VARIABLE EXIT_LEVEL + OUTPUT_VARIABLE TARGET_KERNEL + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + endmacro() + + macro(TO_PROCESS IMAGE TARGET_KERNEL) + if(${HOST_KERNEL} VERSION_LESS ${TARGET_KERNEL}) + set(${VAR} False) + message(STATUS "Host kernel \"${HOST_KERNEL}\" version too low to promote process isolation for \"${IMAGE}\" [${TARGET_KERNEL}] and requires falling back to HyperV. 
Please update your host OS.") + else() + message(STATUS "Promoting \"${IMAGE}\" [${TARGET_KERNEL}] to process isolation with host kernel [${HOST_KERNEL}] version") + endif() + endmacro() + + INSPECT(${BASE}) + + if(${EXIT_LEVEL} EQUAL 0) + TO_PROCESS(${BASE} ${TARGET_KERNEL}) + else() + message(STATUS "\"${BASE}\" not found in local registry, pulling...") + execute_process(COMMAND "${DOCKER_EXE}" pull ${BASE}) + + INSPECT(${BASE}) + TO_PROCESS(${BASE} ${TARGET_KERNEL}) + endif() set(${VAR} ${${VAR}} PARENT_SCOPE) endfunction() @@ -380,151 +196,46 @@ endfunction() execute_process(COMMAND cmd /C ver OUTPUT_VARIABLE PIPE OUTPUT_STRIP_TRAILING_WHITESPACE) string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+" HOST_KERNEL "${PIPE}") -set(BASES - mcr.microsoft.com/windows/nanoserver:ltsc2022 - mcr.microsoft.com/powershell:lts-nanoserver-ltsc2022 -) - -_PROMOTE_PROCESS_ISOLATION_("${HOST_KERNEL}" "${BASES}" PROMOTE_TO_PROCESS) - -function(_BUILD_IMAGE_ DOCKERFILE CTX TAG) - set(CMD "${DOCKER_EXE}" build) - if(PROMOTE_TO_PROCESS) - list(APPEND CMD --isolation "process") - endif() - list(APPEND CMD -t ${TAG} -f "${DOCKERFILE}" .) - - execute_process(COMMAND ${CMD} WORKING_DIRECTORY "${CTX}") -endfunction() - -_BUILD_IMAGE_("${NBL_DOCKER_CE_DOCKERFILE_BASE}" "${NBL_DOCKER_CE_DOCKER_CTX}" godbolt/base/windows) -_BUILD_IMAGE_("${DEVEL_DOCKERFILE}" "${DEVEL_CTX}" godbolt/devel/windows) - -message(FATAL_ERROR "STOP TEST, PROMOTE_TO_PROCESS = ${PROMOTE_TO_PROCESS}") - -string(APPEND BAT_PRODUCTION_INSTALL -[=[ -@echo off -setlocal - -set BASE_PATH=C:\ - -xcopy "%BASE_PATH%target" "%BASE_PATH%nsc\install" /s /e /h /i /y /f -if %ERRORLEVEL% neq 0 ( - echo [ERROR] Failed to copy C:\target to C:\nsc\install - exit /b %ERRORLEVEL% -) - -if "%GIT_GODBOLT_REPOSITORY_PATH%"=="" ( - echo [ERROR] Environment variable GIT_GODBOLT_REPOSITORY_PATH is not set! 
- exit /b 1 -) +set(BASE_IMAGE ghcr.io/devsh-graphics-programming/compiler-explorer-docker:nano-2022) +PROMOTE_PROCESS_ISOLATION(${HOST_KERNEL} ${BASE_IMAGE} USE_PROCESS_ISOLATION) -copy "%BASE_PATH%nsc\install\hlsl.local.properties.cmake" "%GIT_GODBOLT_REPOSITORY_PATH%\etc\config\hlsl.local.properties" -if %ERRORLEVEL% neq 0 ( - echo [ERROR] Failed to copy HLSL properties file - exit /b %ERRORLEVEL% -) - -echo [SUCCESS] All production files copied successfully. -exit /b 0 -]=] -) - -string(CONFIGURE "${BAT_PRODUCTION_INSTALL}" BAT_PRODUCTION_INSTALL @ONLY) -file(WRITE "${NBL_DOCKER_CT_NSC_INSTALL_BAT}" "${BAT_PRODUCTION_INSTALL}") +if(USE_PROCESS_ISOLATION) + list(APPEND DOCKER_CLI_ARGS --isolation process) +endif() +set(ORPHAN nsc-orphan) set(NBL_CE_URL http://localhost:80) set(NBL_CE_HEALTHY_CHECK_PY "${NBL_ROOT_PATH}/docker/compiler-explorer/ce_healthy_check.py") set(NBL_CE_ENDPOINT_PY "${NBL_ROOT_PATH}/docker/compiler-explorer/endpoint.py") set(NBL_NSC_BASIC_HLSL_JPAYLOAD "${CMAKE_CURRENT_SOURCE_DIR}/docker/godbolt/hlsl-basic-compile-payload.json") -add_custom_target(run-compiler-explorer - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Performing Pre-Test..." +add_custom_target(run-compiler-explorer ALL + COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Clearing NSC orphans.." + COMMAND "${DOCKER_EXE}" rm -f ${ORPHAN} || "${CMAKE_COMMAND}" -E true + + COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Executing CTests.." COMMAND "${CTEST_EXE}" -C $ --stop-on-failure COMMAND ${NBL_BUILD_INFO_POSTPROCESS_COMMAND} - COMMAND "${DOCKER_EXE}" compose -f "${NBL_DOCKER_CE_COMPOSE_TARGET}" stop ${COMPOSE_NSC_DEV_SERVICE} COMMAND ${NBL_CE_GENERATE_CONFIG_COMMAND} - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --green "OK! Performing executables hot-swap..." 
- COMMAND "${CMAKE_COMMAND}" -E copy_directory "${NBL_NSC_PREINSTALL_DIRECTORY}" "${NBL_DOCKER_CT_NSC_VOLUME_SOURCE}" - COMMAND "${DOCKER_EXE}" compose -f "${NBL_DOCKER_CE_COMPOSE_TARGET}" up -d ${COMPOSE_NSC_DEV_SERVICE} - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Checking health of Compiler Explorer service..." - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_HEALTHY_CHECK_PY}" --url "${NBL_CE_URL}" --interval 10 --ticks 25 + + COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Running new NSC orphan container.." + COMMAND "${DOCKER_EXE}" run -di -p 80:10240 --name ${ORPHAN} --entrypoint cmd ${DOCKER_CLI_ARGS} ${BASE_IMAGE} + COMMAND "${DOCKER_EXE}" cp "${OUTPUT_CONFIG_FILE}" ${ORPHAN}:C:\\Compiler-Explorer\\etc\\config\\hlsl.local.properties + COMMAND "${DOCKER_EXE}" cp "${CT_SETUP_FILE}" ${ORPHAN}:C:\\setup.cmd + COMMAND "${DOCKER_EXE}" exec -d ${ORPHAN} C:\\setup.cmd + COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_HEALTHY_CHECK_PY}" --url "${NBL_CE_URL}" --interval 5 --ticks 25 COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --green "Compiler Explorer is running, type \"localhost\" in your browser!" - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Post-Checking if NSC is able to compile basic shader file..." + + COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Post-Checking if NSC container is able to compile basic shader input..." COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" --url "${NBL_CE_URL}" --endpoint /api/compiler/nsc_$>_upstream/compile --method POST --json "${NBL_NSC_BASIC_HLSL_JPAYLOAD}" - COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --green "OK! NSC is healthy." + COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --green "OK! NSC container is healthy." 
+ WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" VERBATIM USES_TERMINAL ) -add_custom_target(is-compiler-explorer-running - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_HEALTHY_CHECK_PY}" --url "${NBL_CE_URL}" --ticks 1 - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" --url "${NBL_CE_URL}" --endpoint /api/compilers - VERBATIM - USES_TERMINAL -) - -# Production NSC image -set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/docker/nsc-production") -set(BASE_IMAGE "${COMPOSE_NSC_ORPHAN_PRODUCTION_TEST_IMAGE}") -set(NBL_DOCKER_TMP_PRODUCTION_TARGET "C:\\target") -GEN_DOCKER_CONTENT("" "${OUTPUT_DIRECTORY}" -[=[ -LABEL maintainer="Arkadiusz Lachowicz " ` - org.opencontainers.image.authors="Arkadiusz Lachowicz " ` - org.opencontainers.image.title="Compiler Explorer with Nabla Shader Compilers in Docker" ` - org.opencontainers.image.description="Docker image to run Compiler Explorer instance with Nabla Shader Compilers" ` - org.opencontainers.image.url="https://github.com/Devsh-Graphics-Programming/Nabla" ` - org.opencontainers.image.source="https://github.com/Devsh-Graphics-Programming/Nabla" ` - org.opencontainers.image.documentation="https://github.com/Devsh-Graphics-Programming/Nabla/tree/master/tools/nsc/docker" - -ENTRYPOINT ["powershell.exe", "-ExecutionPolicy", "Bypass", "-Command", "npm", "--prefix", "$env:GIT_GODBOLT_REPOSITORY_PATH", "start", "--", "--language", "hlsl"] -]=] -[=[ - -]=] -nsc-ce-production-cache-webpack -nsc.ce.production.cache.webpack -${COMPOSE_NSC_PRODUCTION_CACHE_IMAGE} -OFF -) - -set(NBL_CE_URL http://localhost:6969) - -add_custom_target(create-production-compiler-explorer - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Removing any remaining pre-test orphan containers..." 
- COMMAND "${DOCKER_EXE}" rm -f production-ce-orphan-run-test || "${CMAKE_COMMAND}" -E true - COMMAND "${DOCKER_EXE}" rm -f production-ce-orphan-cache-webpack || "${CMAKE_COMMAND}" -E true - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Creating pre-test production image..." - COMMAND "${DOCKER_EXE}" run -dit -v "${NBL_DOCKER_CT_NSC_VOLUME_SOURCE}:${NBL_DOCKER_TMP_PRODUCTION_TARGET}" --name production-ce-orphan-run-test --entrypoint "cmd" "${COMPOSE_NSC_DEV_IMAGE}" - COMMAND "${DOCKER_EXE}" exec production-ce-orphan-run-test "${NBL_DOCKER_TMP_PRODUCTION_TARGET}\\${NBL_DOCKER_INSTALL_BAT_FILENAME}" - COMMAND "${DOCKER_EXE}" stop production-ce-orphan-run-test - COMMAND "${DOCKER_EXE}" commit -m "Copy NSC install redists" production-ce-orphan-run-test "${COMPOSE_NSC_ORPHAN_PRODUCTION_TEST_IMAGE}" - COMMAND "${DOCKER_EXE}" compose build - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Running pre-test production image, caching webpack & running final checks..." - COMMAND "${DOCKER_EXE}" run -dit -p 6969:10240 --name production-ce-orphan-cache-webpack "${COMPOSE_NSC_PRODUCTION_CACHE_IMAGE}" - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_HEALTHY_CHECK_PY}" --url "${NBL_CE_URL}" --interval 10 --ticks 35 - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" --url "${NBL_CE_URL}" --endpoint /api/compilers --disable-cookies --timeout 69 - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" --url "${NBL_CE_URL}" --endpoint /api/compiler/nsc_release_upstream/compile --method POST --json "${NBL_NSC_BASIC_HLSL_JPAYLOAD}" --disable-cookies --timeout 69 - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" --url "${NBL_CE_URL}" --endpoint /api/compiler/nsc_relwithdebinfo_upstream/compile --method POST --json "${NBL_NSC_BASIC_HLSL_JPAYLOAD}" --disable-cookies --timeout 69 - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" --url "${NBL_CE_URL}" --endpoint /api/compiler/nsc_debug_upstream/compile --method POST --json 
"${NBL_NSC_BASIC_HLSL_JPAYLOAD}" --disable-cookies --timeout 69 - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Passed all tests! Creating final production image..." - COMMAND "${DOCKER_EXE}" stop production-ce-orphan-cache-webpack - COMMAND "${DOCKER_EXE}" commit -m "Perform tests, cache webpack build" production-ce-orphan-cache-webpack "${COMPOSE_NSC_PRODUCTION_IMAGE}" - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --green "Created final `${COMPOSE_NSC_PRODUCTION_IMAGE}` production image!" - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --green "To run the production image, execute: 'docker run -p 80:10240 ${COMPOSE_NSC_PRODUCTION_IMAGE}'," - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --green "'docker run -p 80:10240 ${COMPOSE_NSC_PRODUCTION_IMAGE}'." - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --green "The production image can be pushed safely to the public registry." - WORKING_DIRECTORY "${OUTPUT_DIRECTORY}" - VERBATIM - USES_TERMINAL -) - add_dependencies(run-compiler-explorer nsc) set_target_properties(run-compiler-explorer PROPERTIES FOLDER "Godbolt") -set_target_properties(is-compiler-explorer-running PROPERTIES FOLDER "Godbolt") -set_target_properties(create-production-compiler-explorer PROPERTIES FOLDER "Godbolt") endif() \ No newline at end of file From 2eea2b0fa9f98a6c77ef519c8f9f1ae44e98eb49 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 26 May 2025 19:21:35 +0700 Subject: [PATCH 194/346] Fix layout constness on IComputePipeline --- include/nbl/asset/IComputePipeline.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/asset/IComputePipeline.h b/include/nbl/asset/IComputePipeline.h index 9ccef877c3..ba4d245473 100644 --- a/include/nbl/asset/IComputePipeline.h +++ b/include/nbl/asset/IComputePipeline.h @@ -26,8 +26,8 @@ class IComputePipeline : public IPipeline, public IComputePi inline const SCachedCreationParams& getCachedCreationParams() const { return m_params; } protected: - explicit 
IComputePipeline(const PipelineLayoutType* layout, const SCachedCreationParams& cachedParams) : - IPipeline(core::smart_refctd_ptr(layout)), + explicit IComputePipeline(PipelineLayoutType* layout, const SCachedCreationParams& cachedParams) : + IPipeline(core::smart_refctd_ptr(layout)), m_params(cachedParams) {} From 969bcb821ee38d7333a36b513d6f28e0ba1248fa Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 26 May 2025 19:21:57 +0700 Subject: [PATCH 195/346] Fix ICPUAcclerationStructure to use computeDependantsImpl --- include/nbl/asset/ICPUAccelerationStructure.h | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index 73365cbfce..3370e31cab 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -271,10 +271,12 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA inline core::unordered_set computeDependants() const override { - core::unordered_set dependants; - for (const auto& instance : *m_instances) - dependants.insert(instance.getBase().blas.get()); - return dependants; + return computeDependantsImpl(this); + } + + inline core::unordered_set computeDependants() override + { + return computeDependantsImpl(this); } // @@ -375,6 +377,16 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA core::smart_refctd_dynamic_array m_instances = nullptr; hlsl::acceleration_structures::top_level::BuildRangeInfo m_buildRangeInfo; core::bitflag m_buildFlags = BUILD_FLAGS::PREFER_FAST_BUILD_BIT; + + template + requires(std::same_as, ICPUTopLevelAccelerationStructure>) + static auto computeDependantsImpl(Self* self) { + using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; + core::unordered_set dependants; + for (const auto& instance : *self->m_instances) + dependants.insert(instance.getBase().blas.get()); + return 
dependants; + } }; } From 3e963393781dae658d90942c158a417ed2742aed Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 26 May 2025 19:22:32 +0700 Subject: [PATCH 196/346] Fix ICPUAnimationLibrary to use computeDependantsImpl --- include/nbl/asset/ICPUAnimationLibrary.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/nbl/asset/ICPUAnimationLibrary.h b/include/nbl/asset/ICPUAnimationLibrary.h index 8a6cdaf52a..1663447b73 100644 --- a/include/nbl/asset/ICPUAnimationLibrary.h +++ b/include/nbl/asset/ICPUAnimationLibrary.h @@ -98,7 +98,12 @@ class ICPUAnimationLibrary final : public IAnimationLibrary, public inline core::unordered_set computeDependants() const override { - return { m_keyframeStorageBinding.buffer.get(), m_timestampStorageBinding.buffer.get(), m_animationStorageRange.buffer.get() }; + return computeDependantsImpl(this); + } + + inline core::unordered_set computeDependants() override + { + return computeDependantsImpl(this); } private: From 6de189d2a04c9bf3070ebe40153cbf3c38f405dc Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 26 May 2025 19:23:01 +0700 Subject: [PATCH 197/346] Remove layout constness from ICPUComputePipeline --- include/nbl/asset/ICPUComputePipeline.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 27d16461a2..b940c2ae48 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -19,7 +19,7 @@ class ICPUComputePipeline final : public ICPUPipeline create(const ICPUPipelineLayout* layout) + static core::smart_refctd_ptr create(ICPUPipelineLayout* layout) { auto retval = new ICPUComputePipeline(layout); return core::smart_refctd_ptr(retval,core::dont_grab); @@ -61,14 +61,14 @@ class ICPUComputePipeline final : public ICPUPipeline clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final + inline core::smart_refctd_ptr 
clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final { auto newPipeline = new ICPUComputePipeline(layout.get()); newPipeline->m_specInfo = m_specInfo.clone(depth); return core::smart_refctd_ptr(newPipeline, core::dont_grab); } - explicit ICPUComputePipeline(const ICPUPipelineLayout* layout): + explicit ICPUComputePipeline(ICPUPipelineLayout* layout): base_t(layout, {}) {} From b0fe0904ef1bfe1bc7459e50628bd851ef0a5d39 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 26 May 2025 19:23:35 +0700 Subject: [PATCH 198/346] Move ICPUDescriptorSet computeDependantsImpl to header --- include/nbl/asset/ICPUDescriptorSet.h | 33 +++++++++++++++++++++++++++ src/nbl/asset/ICPUDescriptorSet.cpp | 33 --------------------------- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/include/nbl/asset/ICPUDescriptorSet.h b/include/nbl/asset/ICPUDescriptorSet.h index c8a6f68d22..2498a438ca 100644 --- a/include/nbl/asset/ICPUDescriptorSet.h +++ b/include/nbl/asset/ICPUDescriptorSet.h @@ -87,6 +87,39 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSet m_descriptorInfos[static_cast(IDescriptor::E_TYPE::ET_COUNT)]; + + template + requires(std::same_as, ICPUDescriptorSet>) + static auto computeDependantsImpl(Self* self) { + using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; + core::unordered_set dependants = { self->m_layout.get() }; + for (auto i = 0u; i < static_cast(IDescriptor::E_TYPE::ET_COUNT); i++) + { + if (!self->m_descriptorInfos[i]) continue; + const auto size = self->m_descriptorInfos[i]->size(); + for (auto desc_i = 0u; desc_i < size; desc_i++) + { + auto* desc = self->m_descriptorInfos[i]->operator[](desc_i).desc.get(); + if (!desc) continue; + switch (IDescriptor::GetTypeCategory(static_cast(i))) + { + case IDescriptor::EC_BUFFER: + dependants.insert(static_cast(desc)); + case IDescriptor::EC_SAMPLER: + dependants.insert(static_cast(desc)); + case IDescriptor::EC_IMAGE: + dependants.insert(static_cast(desc)); + 
case IDescriptor::EC_BUFFER_VIEW: + dependants.insert(static_cast(desc)); + case IDescriptor::EC_ACCELERATION_STRUCTURE: + dependants.insert(static_cast(desc)); + default: + break; + } + } + } + return dependants; + } }; } diff --git a/src/nbl/asset/ICPUDescriptorSet.cpp b/src/nbl/asset/ICPUDescriptorSet.cpp index a95074fdb7..730f0847f2 100644 --- a/src/nbl/asset/ICPUDescriptorSet.cpp +++ b/src/nbl/asset/ICPUDescriptorSet.cpp @@ -108,39 +108,6 @@ core::smart_refctd_ptr ICPUDescriptorSet::clone(uint32_t _depth) const return cp; } -template - requires(std::same_as, ICPUDescriptorSet>) -static auto computeDependantsImpl(Self* self) { - using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; - core::unordered_set dependants = { self->m_layout.get() }; - for (auto i = 0u; i < static_cast(IDescriptor::E_TYPE::ET_COUNT); i++) - { - if (!self->m_descriptorInfos[i]) continue; - const auto size = self->m_descriptorInfos[i]->size(); - for (auto desc_i = 0u; desc_i < size; desc_i++) - { - auto* desc = self->m_descriptorInfos[i]->operator[](desc_i).desc.get(); - if (!desc) continue; - switch (IDescriptor::GetTypeCategory(static_cast(i))) - { - case IDescriptor::EC_BUFFER: - dependants.insert(static_cast(desc)); - case IDescriptor::EC_SAMPLER: - dependants.insert(static_cast(desc)); - case IDescriptor::EC_IMAGE: - dependants.insert(static_cast(desc)); - case IDescriptor::EC_BUFFER_VIEW: - dependants.insert(static_cast(desc)); - case IDescriptor::EC_ACCELERATION_STRUCTURE: - dependants.insert(static_cast(desc)); - default: - break; - } - } - } - return dependants; -} - core::unordered_set ICPUDescriptorSet::computeDependants() const { return computeDependantsImpl(this); From 1d764ec0ca5c16787ec470c7dd1192b292b638eb Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 26 May 2025 19:24:11 +0700 Subject: [PATCH 199/346] Remove layout constness from cpu graphics pipeline --- include/nbl/asset/ICPUGraphicsPipeline.h | 6 +++--- include/nbl/asset/IGraphicsPipeline.h | 4 ++-- 2 
files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index 4a7ee3b695..4a1520880d 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -20,7 +20,7 @@ class ICPUGraphicsPipeline final : public ICPUPipeline create(const ICPUPipelineLayout* layout) + static core::smart_refctd_ptr create(ICPUPipelineLayout* layout) { auto retval = new ICPUGraphicsPipeline(layout); return core::smart_refctd_ptr(retval,core::dont_grab); @@ -79,7 +79,7 @@ class ICPUGraphicsPipeline final : public ICPUPipeline m_specInfos; private: - explicit ICPUGraphicsPipeline(const ICPUPipelineLayout* layout) + explicit ICPUGraphicsPipeline(ICPUPipelineLayout* layout) : base_t(layout, {}, {}) {} @@ -108,7 +108,7 @@ class ICPUGraphicsPipeline final : public ICPUPipeline clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final + inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final { auto* newPipeline = new ICPUGraphicsPipeline(layout.get()); newPipeline->m_params = m_params; diff --git a/include/nbl/asset/IGraphicsPipeline.h b/include/nbl/asset/IGraphicsPipeline.h index 090a368c2f..5b445afae5 100644 --- a/include/nbl/asset/IGraphicsPipeline.h +++ b/include/nbl/asset/IGraphicsPipeline.h @@ -109,8 +109,8 @@ class IGraphicsPipeline : public IPipeline, public IGraphics } protected: - explicit IGraphicsPipeline(const PipelineLayoutType* layout, const SCachedCreationParams& cachedParams, const renderpass_t* renderpass) : - IPipeline(core::smart_refctd_ptr(layout)), + explicit IGraphicsPipeline(PipelineLayoutType* layout, const SCachedCreationParams& cachedParams, renderpass_t* renderpass) : + IPipeline(core::smart_refctd_ptr(layout)), m_params(cachedParams), m_renderpass(core::smart_refctd_ptr(renderpass)) {} From 377f25d5e90cb85de3dcc11e6bdbb5d7129c59d2 Mon Sep 17 00:00:00 2001 From: 
kevyuu Date: Mon, 26 May 2025 19:24:30 +0700 Subject: [PATCH 200/346] Remove layout constness from cpu pipeline --- include/nbl/asset/ICPUPipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index 9674b872e0..069c9fc35e 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -144,7 +144,7 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase, public ICPUPipe using PipelineNonAssetBase::PipelineNonAssetBase; virtual ~ICPUPipeline() = default; - virtual core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const = 0; + virtual core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const = 0; }; From 8809bdad28ace63bf92a87364c645018db772fe1 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 26 May 2025 19:25:06 +0700 Subject: [PATCH 201/346] Use computeDependantsImpl in cpu pipeline layout --- include/nbl/asset/ICPUPipelineLayout.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/nbl/asset/ICPUPipelineLayout.h b/include/nbl/asset/ICPUPipelineLayout.h index e755a22f07..4b668c1472 100644 --- a/include/nbl/asset/ICPUPipelineLayout.h +++ b/include/nbl/asset/ICPUPipelineLayout.h @@ -32,12 +32,12 @@ class ICPUPipelineLayout : public IAsset, public IPipelineLayout computeDependants() const override { - core::unordered_set dependants; - for (auto i = 0; i < m_descSetLayouts.size(); i++) - { - if (m_descSetLayouts[i]) continue; - dependants.insert(m_descSetLayouts[i].get()); - } + return computeDependantsImpl(this); + } + + inline core::unordered_set computeDependants() override + { + return computeDependantsImpl(this); } // From 3c0b3ba8e30128c32c62e28f65ae77357d4bb0fb Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 26 May 2025 19:25:26 +0700 Subject: [PATCH 202/346] Fix argument pack passing on IGPUPipeline --- include/nbl/video/IGPUPipeline.h | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/IGPUPipeline.h b/include/nbl/video/IGPUPipeline.h index f9a32786bf..f2e9b79fef 100644 --- a/include/nbl/video/IGPUPipeline.h +++ b/include/nbl/video/IGPUPipeline.h @@ -116,7 +116,7 @@ class IGPUPipeline : public IBackendObject, public PipelineNonBackendObjectBase, template explicit IGPUPipeline(core::smart_refctd_ptr&& device, Args&&... args) : - PipelineNonBackendObjectBase(std::forward(args...)), IBackendObject(std::move(device)) + PipelineNonBackendObjectBase(std::forward(args)...), IBackendObject(std::move(device)) {} virtual ~IGPUPipeline() = default; From 53b45ec0db44f8e9fc8a7bf9e5a8dbfb70d6ad0f Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 26 May 2025 19:25:53 +0700 Subject: [PATCH 203/346] Remove layout constness from cpu ray tracing pipeline --- include/nbl/asset/ICPURayTracingPipeline.h | 2 +- include/nbl/asset/IRayTracingPipeline.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h index 618c851883..1296d8359a 100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -123,7 +123,7 @@ class ICPURayTracingPipeline final : public ICPUPipeline clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final + inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final { auto newPipeline = new ICPURayTracingPipeline(layout.get()); newPipeline->m_raygen = m_raygen.clone(depth); diff --git a/include/nbl/asset/IRayTracingPipeline.h b/include/nbl/asset/IRayTracingPipeline.h index 82b47f1fcb..b97d8d7002 100644 --- a/include/nbl/asset/IRayTracingPipeline.h +++ b/include/nbl/asset/IRayTracingPipeline.h @@ -48,8 +48,8 @@ class IRayTracingPipeline : public IPipeline, public IRayTra inline const SCachedCreationParams& getCachedCreationParams() const { return 
m_params; } protected: - explicit IRayTracingPipeline(const PipelineLayoutType* layout, const SCachedCreationParams& cachedParams) : - IPipeline(core::smart_refctd_ptr(layout)), + explicit IRayTracingPipeline(PipelineLayoutType* layout, const SCachedCreationParams& cachedParams) : + IPipeline(core::smart_refctd_ptr(layout)), m_params(cachedParams) {} From e249931d74f676781980f77b3aab9a5d4da45be1 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 26 May 2025 19:26:17 +0700 Subject: [PATCH 204/346] Add cached parameter to SCreationParams for gpu compute pipeline --- include/nbl/video/IGPUComputePipeline.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/nbl/video/IGPUComputePipeline.h b/include/nbl/video/IGPUComputePipeline.h index 2eb03cf2da..36813699c0 100644 --- a/include/nbl/video/IGPUComputePipeline.h +++ b/include/nbl/video/IGPUComputePipeline.h @@ -65,6 +65,7 @@ class IGPUComputePipeline : public IGPUPipeline flags = FLAGS::NONE; + SCachedCreationParams cached = {}; SShaderSpecInfo shader = {}; }; @@ -75,7 +76,7 @@ class IGPUComputePipeline : public IGPUPipeline(params.layout->getOriginDevice()), core::smart_refctd_ptr(params.layout)), m_flags(params.flags) + IGPUPipeline(core::smart_refctd_ptr(params.layout->getOriginDevice()), params.layout, params.cached), m_flags(params.flags) {} virtual ~IGPUComputePipeline() = default; From 006dd7d32bea9197354c9c0ce49e49a8bf4c9c81 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 26 May 2025 19:26:36 +0700 Subject: [PATCH 205/346] Remove layout constness on IPipeline --- include/nbl/asset/IPipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/asset/IPipeline.h b/include/nbl/asset/IPipeline.h index d2a85c42fb..eb54542403 100644 --- a/include/nbl/asset/IPipeline.h +++ b/include/nbl/asset/IPipeline.h @@ -130,7 +130,7 @@ class IPipeline : public IPipelineBase protected: - inline IPipeline(core::smart_refctd_ptr&& _layout) + inline IPipeline(core::smart_refctd_ptr&& 
_layout) : m_layout(std::move(_layout)) {} core::smart_refctd_ptr m_layout; From 389c358beb5b42cf47a8fc5712c337d5882afc8c Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 26 May 2025 19:26:50 +0700 Subject: [PATCH 206/346] Fix IGPURayTracingPipeline construction --- include/nbl/video/IGPURayTracingPipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/IGPURayTracingPipeline.h b/include/nbl/video/IGPURayTracingPipeline.h index 66e3a01072..beaecd772a 100644 --- a/include/nbl/video/IGPURayTracingPipeline.h +++ b/include/nbl/video/IGPURayTracingPipeline.h @@ -170,7 +170,7 @@ class IGPURayTracingPipeline : public IGPUPipeline(params.layout->getOriginDevice()), params), + IGPURayTracingPipeline(const SCreationParams& params) : IGPUPipeline(core::smart_refctd_ptr(params.layout->getOriginDevice()), params.layout, params.cached), m_flags(params.flags) {} From 81df19b259b2a2a83a411b9cfd55c62361b58769 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 26 May 2025 20:24:52 +0700 Subject: [PATCH 207/346] Fix debloatedHitSpecData error in ILogicalDevice --- src/nbl/video/ILogicalDevice.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 7714219836..62e364a71a 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -1123,8 +1123,8 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline { *debloatedHitSpecData = { .closestHit = debloatTask.debloat(hit.closestHit, debloatedShaders), - .intersection = debloatTask.debloat(hit.intersection, debloatedShaders), .anyHit = debloatTask.debloat(hit.anyHit, debloatedShaders), + .intersection = debloatTask.debloat(hit.intersection, debloatedShaders), }; debloatedHitSpecData++; } From 2d97ce83df891174013197e7127bd8bbd54106de Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 26 May 2025 20:25:10 +0700 Subject: [PATCH 208/346] Fix CComputeBlit --- 
include/nbl/asset/IComputePipeline.h | 1 + include/nbl/video/utilities/CComputeBlit.h | 2 +- src/nbl/video/utilities/CComputeBlit.cpp | 20 +++++++++++--------- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/include/nbl/asset/IComputePipeline.h b/include/nbl/asset/IComputePipeline.h index ba4d245473..2cb38b39f1 100644 --- a/include/nbl/asset/IComputePipeline.h +++ b/include/nbl/asset/IComputePipeline.h @@ -24,6 +24,7 @@ class IComputePipeline : public IPipeline, public IComputePi public: inline const SCachedCreationParams& getCachedCreationParams() const { return m_params; } + inline SCachedCreationParams& getCachedCreationParams() { return m_params; } protected: explicit IComputePipeline(PipelineLayoutType* layout, const SCachedCreationParams& cachedParams) : diff --git a/include/nbl/video/utilities/CComputeBlit.h b/include/nbl/video/utilities/CComputeBlit.h index 9a02915187..66f6871dc6 100644 --- a/include/nbl/video/utilities/CComputeBlit.h +++ b/include/nbl/video/utilities/CComputeBlit.h @@ -67,7 +67,7 @@ class CComputeBlit : public core::IReferenceCounted // required CAssetConverter* converter; // in theory we _could_ accept either pipeline layout type (or just the base) and make the CPU one back from the GPU - const asset::ICPUPipelineLayout* layout; + asset::ICPUPipelineLayout* layout; // must be Uniform Texel Buffer descriptor type hlsl::SBindingInfo kernelWeights; // must be Sampled Image descriptor type diff --git a/src/nbl/video/utilities/CComputeBlit.cpp b/src/nbl/video/utilities/CComputeBlit.cpp index 4c3bbaa03c..edac6e1f5c 100644 --- a/src/nbl/video/utilities/CComputeBlit.cpp +++ b/src/nbl/video/utilities/CComputeBlit.cpp @@ -39,7 +39,7 @@ auto CComputeBlit::createAndCachePipelines(const SPipelinesCreateInfo& info) -> const auto sharedMemoryPerInvocation = core::max(singlePixelStorage*4,info.sharedMemoryPerInvocation); retval.sharedMemorySize = sharedMemoryPerInvocation*retval.workgroupSize; - const auto* layout = info.layout; + auto* 
layout = info.layout; // const auto common = [&]()->std::string @@ -77,14 +77,16 @@ struct ConstevalParameters source->setContentHash(source->computeContentHash()); } - ICPUComputePipeline::SCreationParams params = {}; - params.layout = layout; - params.shader.entryPoint = "main"; - params.shader.shader = shader.get(); - params.shader.requiredSubgroupSize = static_cast(findMSB(limits.maxSubgroupSize)); - // needed for the prefix and reductions to work - params.shader.requireFullSubgroups = true; - return ICPUComputePipeline::create(params); + auto pipeline = ICPUComputePipeline::create(layout); + pipeline->getSpecInfoMut(ESS_COMPUTE)[0] = { + .shader = shader, + .entryPoint = "main", + .requiredSubgroupSize = static_cast(findMSB(limits.maxSubgroupSize)), + }; + pipeline->getCachedCreationParams() = { + .requireFullSubgroups = true, + }; + return pipeline; }; // create blit pipeline cpuPplns[0] = createPipeline("nbl/builtin/hlsl/blit/default_blit.comp.hlsl"); From 7917918f64409e25052e99d74e1b1343df8d1565 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 26 May 2025 15:39:41 +0200 Subject: [PATCH 209/346] finalize NSC image production from CMake, leave a few comments regarding HyperV runner --- tools/nsc/CMakeLists.txt | 132 +++++++++++++++++++++------------------ 1 file changed, 70 insertions(+), 62 deletions(-) diff --git a/tools/nsc/CMakeLists.txt b/tools/nsc/CMakeLists.txt index efe7741f4e..158fd5caf8 100644 --- a/tools/nsc/CMakeLists.txt +++ b/tools/nsc/CMakeLists.txt @@ -9,6 +9,8 @@ set(NBL_NSC_PREINSTALL_DIRECTORY "${GODBOLT_BINARY_PRETEST_DIRECTORY}/.preinstal make_directory("${NBL_NSC_PREINSTALL_DIRECTORY}") set(NBL_DOCKER_CT_NSC_VOLUME_SOURCE "${GODBOLT_BINARY_DIRECTORY}/install") +set(NBL_DOCKER_CTX_DIR "${GODBOLT_BINARY_DIRECTORY}/.ctx") +make_directory("${NBL_DOCKER_CTX_DIR}") set(NBL_DOCKER_INSTALL_BAT_FILENAME install-production.bat) set(NBL_DOCKER_CT_NSC_INSTALL_BAT "${NBL_DOCKER_CT_NSC_VOLUME_SOURCE}/${NBL_DOCKER_INSTALL_BAT_FILENAME}") 
@@ -57,27 +59,24 @@ add_test(NAME NBL_NSC_DUMP_BUILD_INFO_TEST if(NBL_ENABLE_DOCKER_INTEGRATION) +set(BASE_IMAGE ghcr.io/devsh-graphics-programming/compiler-explorer-docker:nano-2022) + find_program(CTEST_EXE NAMES ctest REQUIRED) find_program(DOCKER_EXE NAMES docker REQUIRED) find_file(DXIL_DLL NAMES dxil.dll HINTS "$ENV{CMAKE_WINDOWS_KITS_10_DIR}/Redist/D3D/x64" "C:/Program Files (x86)/Windows Kits/10/Redist/D3D/x64" REQUIRED) -cmake_path(GET DXIL_DLL PARENT_PATH DXIL_DIR) -cmake_path(NATIVE_PATH DXIL_DIR NORMALIZE DXIL_DIR) find_file(ICU_DLL NAMES icu.dll HINTS REQUIRED) -cmake_path(GET ICU_DLL PARENT_PATH ICU_DIR) -cmake_path(NATIVE_PATH ICU_DIR NORMALIZE ICU_DIR) set(ICU_GLOBALIZATION_DIR "C:/Windows/Globalization/ICU") find_file(ICUDTL_DAT NAMES icudtl.dat HINTS "${ICU_GLOBALIZATION_DIR}" REQUIRED) find_file(UCRTBASED_DLL NAMES ucrtbased.dll HINTS ${UCRTBASED_DLL_DIR} REQUIRED) -cmake_path(GET UCRTBASED_DLL PARENT_PATH UCRTBASED_DIR) -cmake_path(NATIVE_PATH UCRTBASED_DIR NORMALIZE UCRTBASED_DIR) find_program(SPIRV_DIS_EXE NAMES spirv-dis HINTS "${VULKAN_SDK}/Bin" REQUIRED) cmake_path(GET SPIRV_DIS_EXE PARENT_PATH SPIRV_DIS_DIR) cmake_path(NATIVE_PATH SPIRV_DIS_DIR NORMALIZE SPIRV_DIS_DIR) +include(InstallRequiredSystemLibraries) cmake_path(NATIVE_PATH MSVC_REDIST_DIR NORMALIZE TOOLSET_REDIST_PATH) file(GLOB_RECURSE VC_MODULES LIST_DIRECTORIES false @@ -85,33 +84,55 @@ file(GLOB_RECURSE VC_MODULES LIST_DIRECTORIES false "${TOOLSET_REDIST_PATH}/debug_nonredist/x64/*.DebugCRT/*.dll" ) -foreach(MODULE ${VC_MODULES}) - get_filename_component(DIR ${MODULE} DIRECTORY) - cmake_path(NATIVE_PATH DIR NORMALIZE DIR) - list(APPEND VC_MODULE_DIRS ${DIR}) -endforeach() - -if(NOT VC_MODULE_DIRS) +if(NOT VC_MODULES) message(FATAL_ERROR "Failed to GLOB for VC Redist modules!") endif() +make_directory("${NBL_DOCKER_CTX_DIR}/Runtimes") +make_directory("${NBL_DOCKER_CTX_DIR}/Nabla") +execute_process( + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${DXIL_DLL}" 
"${NBL_DOCKER_CTX_DIR}/Runtimes" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${ICU_DLL}" "${NBL_DOCKER_CTX_DIR}/Runtimes" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${UCRTBASED_DLL}" "${NBL_DOCKER_CTX_DIR}/Runtimes" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${SPIRV_DIS_EXE}" "${NBL_DOCKER_CTX_DIR}/Runtimes" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different ${VC_MODULES} "${NBL_DOCKER_CTX_DIR}/Runtimes" + COMMAND "${CMAKE_COMMAND}" -E copy_directory_if_different ${ICU_GLOBALIZATION_DIR} "${NBL_DOCKER_CTX_DIR}/Globalization/ICU" +) + set(CT_RUNTIMES C:/pack/runtimes) cmake_path(NATIVE_PATH CT_RUNTIMES NORMALIZE CT_RUNTIMES) -set(HOST_MOUNT_DIRS ${VC_MODULE_DIRS} ${SPIRV_DIS_DIR} ${UCRTBASED_DIR} ${DXIL_DIR} ${ICU_DIR}) -list(REMOVE_DUPLICATES HOST_MOUNT_DIRS) -set(ix 0) -foreach(DIR ${HOST_MOUNT_DIRS}) - set(TARGET_MOUNT_DIR "${CT_RUNTIMES}/system/${ix}") - cmake_path(NATIVE_PATH TARGET_MOUNT_DIR NORMALIZE TARGET_MOUNT_DIR) +set(NBL_DOCKER_CT_NSC_VOLUME_TARGET "${CT_RUNTIMES}/Nabla") +cmake_path(NATIVE_PATH NBL_DOCKER_CT_NSC_VOLUME_SOURCE NORMALIZE NBL_DOCKER_CT_NSC_VOLUME_SOURCE) +cmake_path(NATIVE_PATH NBL_DOCKER_CT_NSC_VOLUME_TARGET NORMALIZE NBL_DOCKER_CT_NSC_VOLUME_TARGET) +cmake_path(NATIVE_PATH NBL_NSC_PREINSTALL_DIRECTORY NORMALIZE NBL_NSC_PREINSTALL_DIRECTORY) + +string(CONFIGURE [=[ +# syntax=docker/dockerfile:1 +# escape=` +FROM @BASE_IMAGE@ +USER ContainerAdministrator - list(APPEND DOCKER_CLI_ARGS -v "${DIR}:${TARGET_MOUNT_DIR}:ro") - list(APPEND CT_MOUNT_DIRS "${TARGET_MOUNT_DIR}") +COPY Runtimes/ C:/Windows/System32/ +COPY Globalization/ICU/ C:/Windows/Globalization/ICU/ - math(EXPR ix "${ix} + 1" OUTPUT_FORMAT DECIMAL) -endforeach() +COPY Nabla/ @NBL_DOCKER_CT_NSC_VOLUME_TARGET@ +COPY hlsl.local.properties.cmake C:/Compiler-Explorer/etc/config/hlsl.local.properties + +ENV NBL_INSTALL_DIRECTORY=@NBL_DOCKER_CT_NSC_VOLUME_TARGET@ ` +NBL_EXPLICIT_MODULE_LOAD_LOG=ON + +WORKDIR C:/Compiler-Explorer +ENTRYPOINT ["node", 
"--no-warnings", "--no-deprecation", "--import=tsx", "./app.js", "--language", "hlsl"] +]=] INSTRUCTIONS @ONLY) + +set(DOCKERFILE "${NBL_DOCKER_CTX_DIR}/Dockerfile") +file(WRITE "${DOCKERFILE}" "${INSTRUCTIONS}") + +if(NOT DEFINED NSC_IMAGE_NAME) + set(NSC_IMAGE_NAME nano/godbolt/nsc) +endif() -set(NBL_DOCKER_CT_NSC_VOLUME_TARGET "${CT_RUNTIMES}/Nabla") set(NBL_BUILD_INFO_POSTPROCESS_COMMAND "${CMAKE_COMMAND}" "-DNBL_EXECUTABLE_PATH=${NBL_NSC_PREINSTALL_TARGET_EXE_FILEPATH}" @@ -120,13 +141,8 @@ set(NBL_BUILD_INFO_POSTPROCESS_COMMAND "-DNBL_OUTPUT_EXE_OVERRIDE=$" -P "${NBL_ROOT_PATH}/cmake/scripts/nbl/nablaBuildInfo.cmake" ) -cmake_path(NATIVE_PATH NBL_DOCKER_CT_NSC_VOLUME_SOURCE NORMALIZE NBL_DOCKER_CT_NSC_VOLUME_SOURCE) -cmake_path(NATIVE_PATH NBL_DOCKER_CT_NSC_VOLUME_TARGET NORMALIZE NBL_DOCKER_CT_NSC_VOLUME_TARGET) -cmake_path(NATIVE_PATH NBL_NSC_PREINSTALL_DIRECTORY NORMALIZE NBL_NSC_PREINSTALL_DIRECTORY) -list(APPEND DOCKER_CLI_ARGS -v "${NBL_NSC_PREINSTALL_DIRECTORY}:${NBL_DOCKER_CT_NSC_VOLUME_TARGET}") -#list(APPEND DOCKER_CLI_ARGS -v "${ICU_GLOBALIZATION_DIR}:${ICU_GLOBALIZATION_DIR}:ro") -set(NBL_DOCKER_NSC_COMPILER_CONFIG_OUTPUT "${NBL_DOCKER_CT_NSC_VOLUME_SOURCE}/hlsl.local.properties.cmake") +set(NBL_DOCKER_NSC_COMPILER_CONFIG_OUTPUT "${NBL_DOCKER_CTX_DIR}/hlsl.local.properties.cmake") string(GENEX_STRIP "${NBL_PACKAGE_RUNTIME_EXE_DIR_PATH}" NBL_RELATIVE_ENTRY) set(OUTPUT_CONFIG_FILE $) set(NBL_CE_GENERATE_CONFIG_COMMAND @@ -139,25 +155,6 @@ set(NBL_CE_GENERATE_CONFIG_COMMAND -P "${CMAKE_CURRENT_SOURCE_DIR}/ce-generate-config.cmake" ) -set(CT_ENV_FILE "${CMAKE_CURRENT_BINARY_DIR}/.env") -string(CONFIGURE [=[ -CT_MOUNT_DIRS=@CT_MOUNT_DIRS@ -NBL_INSTALL_DIRECTORY=@NBL_DOCKER_CT_NSC_VOLUME_TARGET@ -NBL_EXPLICIT_MODULE_LOAD_LOG=ON -]=] ENV_CONTENT @ONLY) -file(WRITE "${CT_ENV_FILE}" "${ENV_CONTENT}") -list(APPEND DOCKER_CLI_ARGS --env-file "${CT_ENV_FILE}") - -set(CT_SETUP_FILE "${CMAKE_CURRENT_BINARY_DIR}/setup.bat") -string(CONFIGURE [=[ -@echo off -set 
"PATH=%PATH%;%CT_MOUNT_DIRS%" -setx PATH "%PATH%" /M -node --no-warnings --no-deprecation --import=tsx ./app.js --language hlsl -]=] SETUP_CONTENT @ONLY) -file(WRITE "${CT_SETUP_FILE}" "${SETUP_CONTENT}") -list(APPEND DOCKER_CLI_ARGS) - function(PROMOTE_PROCESS_ISOLATION HOST_KERNEL BASE VAR) set(${VAR} True) @@ -172,9 +169,9 @@ function(PROMOTE_PROCESS_ISOLATION HOST_KERNEL BASE VAR) macro(TO_PROCESS IMAGE TARGET_KERNEL) if(${HOST_KERNEL} VERSION_LESS ${TARGET_KERNEL}) set(${VAR} False) - message(STATUS "Host kernel \"${HOST_KERNEL}\" version too low to promote process isolation for \"${IMAGE}\" [${TARGET_KERNEL}] and requires falling back to HyperV. Please update your host OS.") + message(STATUS "Host kernel \"${HOST_KERNEL}\" version too low to promote process isolation with \"${IMAGE}\" [${TARGET_KERNEL}] and requires falling back to HyperV. Please update your host OS.") else() - message(STATUS "Promoting \"${IMAGE}\" [${TARGET_KERNEL}] to process isolation with host kernel [${HOST_KERNEL}] version") + message(STATUS "\"${IMAGE}\" [${TARGET_KERNEL}] can be promoted to process isolation with host kernel [${HOST_KERNEL}] version") endif() endmacro() @@ -195,12 +192,16 @@ endfunction() execute_process(COMMAND cmd /C ver OUTPUT_VARIABLE PIPE OUTPUT_STRIP_TRAILING_WHITESPACE) string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+" HOST_KERNEL "${PIPE}") - -set(BASE_IMAGE ghcr.io/devsh-graphics-programming/compiler-explorer-docker:nano-2022) PROMOTE_PROCESS_ISOLATION(${HOST_KERNEL} ${BASE_IMAGE} USE_PROCESS_ISOLATION) if(USE_PROCESS_ISOLATION) - list(APPEND DOCKER_CLI_ARGS --isolation process) + set(ISOLATION --isolation process) +else() + # TODO: we will need to use GET_RUNTIME_DEPENDENCIES which uses objdump + # https://cmake.org/cmake/help/latest/command/file.html#get-runtime-dependencies + # to collect *all* required deps and copy (FROM at least server core) to destination + # image, it will fail currently if we fully isolate it with VM due to lack of certain DLLs 
+ message(FATAL_ERROR "HyperV is NOT supported! Update your OS!") # yet endif() set(ORPHAN nsc-orphan) @@ -210,20 +211,27 @@ set(NBL_CE_ENDPOINT_PY "${NBL_ROOT_PATH}/docker/compiler-explorer/endpoint.py") set(NBL_NSC_BASIC_HLSL_JPAYLOAD "${CMAKE_CURRENT_SOURCE_DIR}/docker/godbolt/hlsl-basic-compile-payload.json") add_custom_target(run-compiler-explorer ALL - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Clearing NSC orphans.." + COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Killing remaining NSC orphans" COMMAND "${DOCKER_EXE}" rm -f ${ORPHAN} || "${CMAKE_COMMAND}" -E true - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Executing CTests.." + COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Executing CTests" COMMAND "${CTEST_EXE}" -C $ --stop-on-failure + + COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Generating NSC build info" COMMAND ${NBL_BUILD_INFO_POSTPROCESS_COMMAND} + + COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Generating NSC godbolt config" COMMAND ${NBL_CE_GENERATE_CONFIG_COMMAND} - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Running new NSC orphan container.." 
- COMMAND "${DOCKER_EXE}" run -di -p 80:10240 --name ${ORPHAN} --entrypoint cmd ${DOCKER_CLI_ARGS} ${BASE_IMAGE} - COMMAND "${DOCKER_EXE}" cp "${OUTPUT_CONFIG_FILE}" ${ORPHAN}:C:\\Compiler-Explorer\\etc\\config\\hlsl.local.properties - COMMAND "${DOCKER_EXE}" cp "${CT_SETUP_FILE}" ${ORPHAN}:C:\\setup.cmd - COMMAND "${DOCKER_EXE}" exec -d ${ORPHAN} C:\\setup.cmd - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_HEALTHY_CHECK_PY}" --url "${NBL_CE_URL}" --interval 5 --ticks 25 + COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Updating NSC package context" + COMMAND "${CMAKE_COMMAND}" -E copy_directory_if_different "${NBL_NSC_PREINSTALL_DIRECTORY}" "${NBL_DOCKER_CTX_DIR}/Nabla" + + COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Building NSC Godbolt image" + COMMAND "${DOCKER_EXE}" build ${ISOLATION} -f "${DOCKERFILE}" -t ${NSC_IMAGE_NAME} "${NBL_DOCKER_CTX_DIR}" + + COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Running new NSC orphan container" + COMMAND "${DOCKER_EXE}" run -di -p 80:10240 ${ISOLATION} --name ${ORPHAN} ${NSC_IMAGE_NAME} + COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_HEALTHY_CHECK_PY}" --url "${NBL_CE_URL}" --interval 5 --ticks 12 COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --green "Compiler Explorer is running, type \"localhost\" in your browser!" COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Post-Checking if NSC container is able to compile basic shader input..." From 27c50d70a595db1a7ca6f029bc7122cb32e438ec Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 26 May 2025 17:00:11 +0200 Subject: [PATCH 210/346] compression! 
--- tools/nsc/CMakeLists.txt | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/tools/nsc/CMakeLists.txt b/tools/nsc/CMakeLists.txt index 158fd5caf8..157c4fa646 100644 --- a/tools/nsc/CMakeLists.txt +++ b/tools/nsc/CMakeLists.txt @@ -99,7 +99,7 @@ execute_process( COMMAND "${CMAKE_COMMAND}" -E copy_directory_if_different ${ICU_GLOBALIZATION_DIR} "${NBL_DOCKER_CTX_DIR}/Globalization/ICU" ) -set(CT_RUNTIMES C:/pack/runtimes) +set(CT_RUNTIMES C:/runtimes) cmake_path(NATIVE_PATH CT_RUNTIMES NORMALIZE CT_RUNTIMES) set(NBL_DOCKER_CT_NSC_VOLUME_TARGET "${CT_RUNTIMES}/Nabla") @@ -110,20 +110,34 @@ cmake_path(NATIVE_PATH NBL_NSC_PREINSTALL_DIRECTORY NORMALIZE NBL_NSC_PREINSTALL string(CONFIGURE [=[ # syntax=docker/dockerfile:1 # escape=` -FROM @BASE_IMAGE@ -USER ContainerAdministrator -COPY Runtimes/ C:/Windows/System32/ -COPY Globalization/ICU/ C:/Windows/Globalization/ICU/ +# ---------------- COMPRESS STEP ---------------- +FROM @BASE_IMAGE@ as compress + +COPY --link Runtimes/ C:/pack/Windows/System32/ +COPY --link Globalization/ICU/ C:/pack/Windows/Globalization/ICU/ +COPY --link Nabla/ C:/pack/runtimes/Nabla/ + +ARG IMPL_COMPRESSION_OPTIONS=-T0 +ARG IMPL_COMPRESSION_LEVEL=3 + +WORKDIR C:\pack +RUN ` +tar -cf - Windows | zstd %IMPL_COMPRESSION_OPTIONS% -%IMPL_COMPRESSION_LEVEL% -o windows-artifacts.tar.zst && ` +tar -cf - runtimes | zstd %IMPL_COMPRESSION_OPTIONS% -%IMPL_COMPRESSION_LEVEL% -o nabla-artifacts.tar.zst + +# ---------------- FINAL IMAGE ---------------- +FROM @BASE_IMAGE@ -COPY Nabla/ @NBL_DOCKER_CT_NSC_VOLUME_TARGET@ +COPY --link --from=compress ["C:/pack/windows-artifacts.tar.zst", "C:/pack/"] +COPY --link --from=compress ["C:/pack/nabla-artifacts.tar.zst", "C:/pack/"] COPY hlsl.local.properties.cmake C:/Compiler-Explorer/etc/config/hlsl.local.properties ENV NBL_INSTALL_DIRECTORY=@NBL_DOCKER_CT_NSC_VOLUME_TARGET@ ` NBL_EXPLICIT_MODULE_LOAD_LOG=ON WORKDIR C:/Compiler-Explorer -ENTRYPOINT ["node", 
"--no-warnings", "--no-deprecation", "--import=tsx", "./app.js", "--language", "hlsl"] +ENTRYPOINT ["C:\\unpack.bat", "&&", "node", "--no-warnings", "--no-deprecation", "--import=tsx", "./app.js", "--language", "hlsl"] ]=] INSTRUCTIONS @ONLY) set(DOCKERFILE "${NBL_DOCKER_CTX_DIR}/Dockerfile") From e3848cef01a6d243d2474c0a195d81204d016406 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 26 May 2025 23:17:19 +0200 Subject: [PATCH 211/346] Make docker in docker build, adjust tools/nsc/CMakeLists.txt, update build-nabla.yml to produce NSC image from builder container --- .github/workflows/build-nabla.yml | 7 +- CMakePresets.json | 6 +- tools/nsc/CMakeLists.txt | 147 +++++++++++++++++------------- 3 files changed, 96 insertions(+), 64 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 7dc8759e84..a194734472 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -54,6 +54,11 @@ jobs: - name: Run Container run: | + $ctx = docker context show + $dockerHost = (docker context inspect $ctx | ConvertFrom-Json).Endpoints.docker.Host + $pipeName = [regex]::Match($dockerHost, '/pipe/(?.+)$').Groups['n'].Value + $pipeHost = "\\.\pipe\$pipeName" + docker run ` --entrypoint ${{ env.entry }} -di --isolation process ` --env-file .\docker\ci-windows.env ` @@ -87,7 +92,7 @@ jobs: docker exec orphan ` ${{ env.entry }} ${{ env.cmd }} -Command cmake --build ` --preset ci-build-dynamic-${{ matrix.vendor }} ` - -t nsc --config ${{ matrix.config }} + -t run-compiler-explorer --config ${{ matrix.config }} - name: Container – Install NSC run: | diff --git a/CMakePresets.json b/CMakePresets.json index ad3ae50b6d..359ec6fb02 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -91,7 +91,8 @@ "inherits": "ci-configure-static-windows-base", "generator": "Ninja Multi-Config", "cacheVariables": { - "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/docker/msvc-winsdk/cmake/winsdk-msvc-toolchain.cmake" + 
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/docker/msvc-winsdk/cmake/winsdk-msvc-toolchain.cmake", + "NBL_ENABLE_DOCKER_INTEGRATION": "ON" } }, { @@ -99,7 +100,8 @@ "inherits": "ci-configure-dynamic-windows-base", "generator": "Ninja Multi-Config", "cacheVariables": { - "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/docker/msvc-winsdk/cmake/winsdk-msvc-toolchain.cmake" + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/docker/msvc-winsdk/cmake/winsdk-msvc-toolchain.cmake", + "NBL_ENABLE_DOCKER_INTEGRATION": "ON" } }, { diff --git a/tools/nsc/CMakeLists.txt b/tools/nsc/CMakeLists.txt index 157c4fa646..b0fec5b7f2 100644 --- a/tools/nsc/CMakeLists.txt +++ b/tools/nsc/CMakeLists.txt @@ -66,10 +66,7 @@ find_program(DOCKER_EXE NAMES docker REQUIRED) find_file(DXIL_DLL NAMES dxil.dll HINTS "$ENV{CMAKE_WINDOWS_KITS_10_DIR}/Redist/D3D/x64" "C:/Program Files (x86)/Windows Kits/10/Redist/D3D/x64" REQUIRED) -find_file(ICU_DLL NAMES icu.dll HINTS REQUIRED) -set(ICU_GLOBALIZATION_DIR "C:/Windows/Globalization/ICU") -find_file(ICUDTL_DAT NAMES icudtl.dat HINTS "${ICU_GLOBALIZATION_DIR}" REQUIRED) - +set(ICU_GLOBALIZATION_DIR C:\\Windows\\Globalization\\ICU) find_file(UCRTBASED_DLL NAMES ucrtbased.dll HINTS ${UCRTBASED_DLL_DIR} REQUIRED) find_program(SPIRV_DIS_EXE NAMES spirv-dis HINTS "${VULKAN_SDK}/Bin" REQUIRED) @@ -77,6 +74,15 @@ cmake_path(GET SPIRV_DIS_EXE PARENT_PATH SPIRV_DIS_DIR) cmake_path(NATIVE_PATH SPIRV_DIS_DIR NORMALIZE SPIRV_DIS_DIR) include(InstallRequiredSystemLibraries) + +if(NOT MSVC_REDIST_DIR) + if(MSVC_REDIST_BASE) # fallback to our CI toolset + set(MSVC_REDIST_DIR "${MSVC_REDIST_BASE}") + else() + message(FATAL_ERROR "Could not find MSVC_REDIST_DIR, define yourself!") + endif() +endif() + cmake_path(NATIVE_PATH MSVC_REDIST_DIR NORMALIZE TOOLSET_REDIST_PATH) file(GLOB_RECURSE VC_MODULES LIST_DIRECTORIES false @@ -92,11 +98,9 @@ make_directory("${NBL_DOCKER_CTX_DIR}/Runtimes") make_directory("${NBL_DOCKER_CTX_DIR}/Nabla") execute_process( COMMAND "${CMAKE_COMMAND}" -E 
copy_if_different "${DXIL_DLL}" "${NBL_DOCKER_CTX_DIR}/Runtimes" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${ICU_DLL}" "${NBL_DOCKER_CTX_DIR}/Runtimes" COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${UCRTBASED_DLL}" "${NBL_DOCKER_CTX_DIR}/Runtimes" COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${SPIRV_DIS_EXE}" "${NBL_DOCKER_CTX_DIR}/Runtimes" COMMAND "${CMAKE_COMMAND}" -E copy_if_different ${VC_MODULES} "${NBL_DOCKER_CTX_DIR}/Runtimes" - COMMAND "${CMAKE_COMMAND}" -E copy_directory_if_different ${ICU_GLOBALIZATION_DIR} "${NBL_DOCKER_CTX_DIR}/Globalization/ICU" ) set(CT_RUNTIMES C:/runtimes) @@ -107,6 +111,7 @@ cmake_path(NATIVE_PATH NBL_DOCKER_CT_NSC_VOLUME_SOURCE NORMALIZE NBL_DOCKER_CT_N cmake_path(NATIVE_PATH NBL_DOCKER_CT_NSC_VOLUME_TARGET NORMALIZE NBL_DOCKER_CT_NSC_VOLUME_TARGET) cmake_path(NATIVE_PATH NBL_NSC_PREINSTALL_DIRECTORY NORMALIZE NBL_NSC_PREINSTALL_DIRECTORY) +set(CORE_IMAGE mcr.microsoft.com/windows/servercore:ltsc2022) string(CONFIGURE [=[ # syntax=docker/dockerfile:1 # escape=` @@ -114,8 +119,9 @@ string(CONFIGURE [=[ # ---------------- COMPRESS STEP ---------------- FROM @BASE_IMAGE@ as compress +COPY --link --from=@CORE_IMAGE@ C:/Windows/System32/icu.dll C:/pack/Windows/System32/ +COPY --link --from=@CORE_IMAGE@ C:/Windows/Globalization/ICU/ C:/pack/Windows/Globalization/ICU/ COPY --link Runtimes/ C:/pack/Windows/System32/ -COPY --link Globalization/ICU/ C:/pack/Windows/Globalization/ICU/ COPY --link Nabla/ C:/pack/runtimes/Nabla/ ARG IMPL_COMPRESSION_OPTIONS=-T0 @@ -134,7 +140,8 @@ COPY --link --from=compress ["C:/pack/nabla-artifacts.tar.zst", "C:/pack/"] COPY hlsl.local.properties.cmake C:/Compiler-Explorer/etc/config/hlsl.local.properties ENV NBL_INSTALL_DIRECTORY=@NBL_DOCKER_CT_NSC_VOLUME_TARGET@ ` -NBL_EXPLICIT_MODULE_LOAD_LOG=ON +NBL_EXPLICIT_MODULE_LOAD_LOG=ON ` +ICU_DATA=C:\Windows\Globalization\ICU WORKDIR C:/Compiler-Explorer ENTRYPOINT ["C:\\unpack.bat", "&&", "node", "--no-warnings", "--no-deprecation", 
"--import=tsx", "./app.js", "--language", "hlsl"] @@ -147,27 +154,9 @@ if(NOT DEFINED NSC_IMAGE_NAME) set(NSC_IMAGE_NAME nano/godbolt/nsc) endif() -set(NBL_BUILD_INFO_POSTPROCESS_COMMAND - "${CMAKE_COMMAND}" - "-DNBL_EXECUTABLE_PATH=${NBL_NSC_PREINSTALL_TARGET_EXE_FILEPATH}" - "-DNBL_BUILD_INFO=${NBL_NSC_PREINSTALL_TARGET_BUILD_INFO}" - "-DNBL_OUTPUT_FILE=${NBL_NSC_PREINSTALL_TARGET_BUILD_INFO}" - "-DNBL_OUTPUT_EXE_OVERRIDE=$" - -P "${NBL_ROOT_PATH}/cmake/scripts/nbl/nablaBuildInfo.cmake" -) - set(NBL_DOCKER_NSC_COMPILER_CONFIG_OUTPUT "${NBL_DOCKER_CTX_DIR}/hlsl.local.properties.cmake") string(GENEX_STRIP "${NBL_PACKAGE_RUNTIME_EXE_DIR_PATH}" NBL_RELATIVE_ENTRY) set(OUTPUT_CONFIG_FILE $) -set(NBL_CE_GENERATE_CONFIG_COMMAND - "${CMAKE_COMMAND}" - "-DSPIRV_DIS_EXE=spirv-dis.exe" - "-DNSC_RELEASE_BUILD_INFO=$" - "-DNSC_RELWITHDEBINFO_BUILD_INFO=$" - "-DNSC_DEBUG_BUILD_INFO=$" - "-DOUTPUT_CONFIG_FILE=${OUTPUT_CONFIG_FILE}" - -P "${CMAKE_CURRENT_SOURCE_DIR}/ce-generate-config.cmake" -) function(PROMOTE_PROCESS_ISOLATION HOST_KERNEL BASE VAR) set(${VAR} True) @@ -209,52 +198,88 @@ string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+" HOST_KERNEL "${PIPE}") PROMOTE_PROCESS_ISOLATION(${HOST_KERNEL} ${BASE_IMAGE} USE_PROCESS_ISOLATION) if(USE_PROCESS_ISOLATION) - set(ISOLATION --isolation process) + set(ISOLATION "--isolation process") else() - # TODO: we will need to use GET_RUNTIME_DEPENDENCIES which uses objdump + # NOTE: we would need to use GET_RUNTIME_DEPENDENCIES which uses objdump # https://cmake.org/cmake/help/latest/command/file.html#get-runtime-dependencies - # to collect *all* required deps and copy (FROM at least server core) to destination + # to collect *all* missing deps and copy (FROM at least server core) to destination nano # image, it will fail currently if we fully isolate it with VM due to lack of certain DLLs - message(FATAL_ERROR "HyperV is NOT supported! Update your OS!") # yet + message(FATAL_ERROR "HyperV is NOT supported! 
Update your OS!") endif() set(ORPHAN nsc-orphan) -set(NBL_CE_URL http://localhost:80) +set(NBL_CE_URL http://${ORPHAN}:10240) set(NBL_CE_HEALTHY_CHECK_PY "${NBL_ROOT_PATH}/docker/compiler-explorer/ce_healthy_check.py") set(NBL_CE_ENDPOINT_PY "${NBL_ROOT_PATH}/docker/compiler-explorer/endpoint.py") set(NBL_NSC_BASIC_HLSL_JPAYLOAD "${CMAKE_CURRENT_SOURCE_DIR}/docker/godbolt/hlsl-basic-compile-payload.json") -add_custom_target(run-compiler-explorer ALL - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Killing remaining NSC orphans" - COMMAND "${DOCKER_EXE}" rm -f ${ORPHAN} || "${CMAKE_COMMAND}" -E true - - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Executing CTests" - COMMAND "${CTEST_EXE}" -C $ --stop-on-failure - - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Generating NSC build info" - COMMAND ${NBL_BUILD_INFO_POSTPROCESS_COMMAND} - - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Generating NSC godbolt config" - COMMAND ${NBL_CE_GENERATE_CONFIG_COMMAND} - - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Updating NSC package context" - COMMAND "${CMAKE_COMMAND}" -E copy_directory_if_different "${NBL_NSC_PREINSTALL_DIRECTORY}" "${NBL_DOCKER_CTX_DIR}/Nabla" - - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Building NSC Godbolt image" - COMMAND "${DOCKER_EXE}" build ${ISOLATION} -f "${DOCKERFILE}" -t ${NSC_IMAGE_NAME} "${NBL_DOCKER_CTX_DIR}" - - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Running new NSC orphan container" - COMMAND "${DOCKER_EXE}" run -di -p 80:10240 ${ISOLATION} --name ${ORPHAN} ${NSC_IMAGE_NAME} - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_HEALTHY_CHECK_PY}" --url "${NBL_CE_URL}" --interval 5 --ticks 12 - COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --green "Compiler Explorer is running, type \"localhost\" in your browser!" - - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Post-Checking if NSC container is able to compile basic shader input..." 
- COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" --url "${NBL_CE_URL}" --endpoint /api/compiler/nsc_$>_upstream/compile --method POST --json "${NBL_NSC_BASIC_HLSL_JPAYLOAD}" - COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --green "OK! NSC container is healthy." +# to avoid "too long input" errors we proxy build instructions to CMake script and write it to build directory +string(CONFIGURE [=[ +execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Killing remaining NSC orphans") +execute_process(COMMAND "${DOCKER_EXE}" rm -f "${ORPHAN}") + +execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Executing CTests") +execute_process(COMMAND "${CTEST_EXE}" -C "$" --stop-on-failure WORKING_DIRECTORY "@CMAKE_CURRENT_BINARY_DIR@" + COMMAND_ERROR_IS_FATAL ANY) + +execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Generating NSC build info") +execute_process(COMMAND "${CMAKE_COMMAND}" + "-DNBL_EXECUTABLE_PATH=${NBL_NSC_PREINSTALL_TARGET_EXE_FILEPATH}" + "-DNBL_BUILD_INFO=${NBL_NSC_PREINSTALL_TARGET_BUILD_INFO}" + "-DNBL_OUTPUT_FILE=${NBL_NSC_PREINSTALL_TARGET_BUILD_INFO}" + "-DNBL_OUTPUT_EXE_OVERRIDE=$" + -P "${NBL_ROOT_PATH}/cmake/scripts/nbl/nablaBuildInfo.cmake" + COMMAND_ERROR_IS_FATAL ANY) + +execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Generating NSC godbolt config") +execute_process(COMMAND "${CMAKE_COMMAND}" + "-DSPIRV_DIS_EXE=spirv-dis.exe" + "-DNSC_RELEASE_BUILD_INFO=$" + "-DNSC_RELWITHDEBINFO_BUILD_INFO=$" + "-DNSC_DEBUG_BUILD_INFO=$" + "-DOUTPUT_CONFIG_FILE=${OUTPUT_CONFIG_FILE}" + -P "${CMAKE_CURRENT_SOURCE_DIR}/ce-generate-config.cmake" + COMMAND_ERROR_IS_FATAL ANY) + +execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Updating NSC package context") +execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory_if_different + "$" + "${NBL_DOCKER_CTX_DIR}/Nabla" + COMMAND_ERROR_IS_FATAL ANY) + +execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Building NSC Godbolt image") +execute_process(COMMAND "${DOCKER_EXE}" build ${ISOLATION} + -f 
"${DOCKERFILE}" + -t ${NSC_IMAGE_NAME} + "${NBL_DOCKER_CTX_DIR}" + COMMAND_ERROR_IS_FATAL ANY) + +execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Running new NSC orphan container") +execute_process(COMMAND "${DOCKER_EXE}" run -di -p 80:10240 ${ISOLATION} + --name "${ORPHAN}" ${NSC_IMAGE_NAME} + COMMAND_ERROR_IS_FATAL ANY) + +execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Health‐check") +execute_process(COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_HEALTHY_CHECK_PY}" + --url "${NBL_CE_URL}" --interval 5 --ticks 12 + COMMAND_ERROR_IS_FATAL ANY) + +execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Post‐Checking basic shader compile") +execute_process(COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" + --url "${NBL_CE_URL}" + --endpoint /api/compiler/nsc_$>_upstream/compile + --method POST --json "${NBL_NSC_BASIC_HLSL_JPAYLOAD}" + COMMAND_ERROR_IS_FATAL ANY) + +execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "OK! NSC container is healthy.") +]=] INSTRUCTIONS) + +file(GENERATE OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/run-compiler-explorer-$.cmake" CONTENT "${INSTRUCTIONS}") - WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" - VERBATIM - USES_TERMINAL +add_custom_target(run-compiler-explorer ALL + COMMAND "${CMAKE_COMMAND}" -P "${CMAKE_CURRENT_BINARY_DIR}/run-compiler-explorer-$.cmake" + VERBATIM + COMMAND_EXPAND_LISTS ) add_dependencies(run-compiler-explorer nsc) From 350c6a3604999abb23d133c8affa3a456181dfdc Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 27 May 2025 11:25:25 +0700 Subject: [PATCH 212/346] more util funcs in config, fix some calculations --- examples_tests | 2 +- .../hlsl/workgroup2/arithmetic_config.hlsl | 48 ++++++++--------- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 52 +++++++++---------- 3 files changed, 50 insertions(+), 52 deletions(-) diff --git a/examples_tests b/examples_tests index bb3a901b5d..2a85f4e091 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 
bb3a901b5de72b78246af20072f4489960287204 +Subproject commit 2a85f4e0911185a85df31f798b92e6902db3383e diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 512641abb8..8ecbe4b5dc 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -19,9 +19,9 @@ template struct virtual_wg_size_log2 { static_assert(WorkgroupSizeLog2>=SubgroupSizeLog2, "WorkgroupSize cannot be smaller than SubgroupSize"); - // static_assert(WorkgroupSizeLog2<=SubgroupSizeLog2+4, "WorkgroupSize cannot be larger than SubgroupSize*16"); + static_assert(WorkgroupSizeLog2<=SubgroupSizeLog2*3+4, "WorkgroupSize cannot be larger than (SubgroupSize^3)*16"); NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2),uint16_t,conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2*2+2),uint16_t,3,2>::value,1>::value; - NBL_CONSTEXPR_STATIC_INLINE uint16_t value = mpl::max_v+SubgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t value = mpl::max_v; // must have at least enough level 0 outputs to feed a single subgroup }; @@ -33,24 +33,6 @@ struct items_per_invocation NBL_CONSTEXPR_STATIC_INLINE uint16_t value1 = uint16_t(0x1u) << conditional_value, ItemsPerInvocationProductLog2>::value; NBL_CONSTEXPR_STATIC_INLINE uint16_t value2 = uint16_t(0x1u) << mpl::max_v; }; - -// explicit specializations for cases that don't fit -#define SPECIALIZE_VIRTUAL_WG_SIZE_CASE(WGLOG2, SGLOG2, LEVELS, VALUE) template<>\ -struct virtual_wg_size_log2\ -{\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = LEVELS;\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t value = VALUE;\ -};\ - -SPECIALIZE_VIRTUAL_WG_SIZE_CASE(11,4,3,12); -SPECIALIZE_VIRTUAL_WG_SIZE_CASE(7,7,1,7); -SPECIALIZE_VIRTUAL_WG_SIZE_CASE(6,6,1,6); -SPECIALIZE_VIRTUAL_WG_SIZE_CASE(5,5,1,5); -SPECIALIZE_VIRTUAL_WG_SIZE_CASE(4,4,1,4); 
-SPECIALIZE_VIRTUAL_WG_SIZE_CASE(3,3,1,3); -SPECIALIZE_VIRTUAL_WG_SIZE_CASE(2,2,1,2); - -#undef SPECIALIZE_VIRTUAL_WG_SIZE_CASE } template @@ -71,16 +53,32 @@ struct ArithmeticConfiguration NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = items_per_invoc_t::value2; static_assert(ItemsPerInvocation_1<=4, "3 level scan would have been needed with this config!"); - NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementCount = conditional_value::value + SubgroupSize*ItemsPerInvocation_1>::value; + NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = conditional_value::value + SubgroupSize*ItemsPerInvocation_1 + >::value; + + static bool electLast() + { + return glsl::gl_SubgroupInvocationID()==SubgroupSize-1; + } + + static uint32_t virtualSubgroupID(const uint32_t subgroupID, const uint32_t virtualIdx) + { + return virtualIdx * (WorkgroupSize >> SubgroupSizeLog2) + subgroupID; + } - static uint32_t virtualSubgroupID(const uint32_t id, const uint32_t offset) + static uint32_t sharedCoalescedIndexNextLevel(const uint32_t subgroupID, const uint32_t itemsPerInvocation) { - return offset * (WorkgroupSize >> SubgroupSizeLog2) + id; + return (subgroupID & (itemsPerInvocation-1)) * SubgroupSize + (subgroupID/itemsPerInvocation); } - static uint32_t sharedMemCoalescedIndex(const uint32_t id, const uint32_t itemsPerInvocation) + static uint32_t sharedCoalescedIndexByComponent(const uint32_t invocationIndex, const uint32_t component) { - return (id & (itemsPerInvocation-1)) * SubgroupSize + (id/itemsPerInvocation); + return component * SubgroupSize + invocationIndex; } }; diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index d44271a260..dd309e0e12 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -104,10 +104,10 @@ struct reduce vector_lv0_t scan_local; dataAccessor.template get(idx * Config::WorkgroupSize + 
virtualInvocationIndex, scan_local); scan_local = reduction0(scan_local); - if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) + if (Config::electLast()) { const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -120,10 +120,10 @@ struct reduce vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(i*Config::SubgroupSize+invocationIndex,lv1_val[i]); + scratchAccessor.template get(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv1_val[i]); lv1_val = reduction1(lv1_val); - if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) + if (Config::electLast()) scratchAccessor.template set(0, lv1_val[Config::ItemsPerInvocation_1-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -159,10 +159,10 @@ struct scan dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); value = inclusiveScan0(value); dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) + if (Config::electLast()) { const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = 
Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -176,12 +176,12 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(i*Config::SubgroupSize+prevIndex,lv1_val[i]); + scratchAccessor.template get(Config::sharedCoalescedIndexByComponent(prevIndex, i),lv1_val[i]); lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); lv1_val = inclusiveScan1(lv1_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template set(i*Config::SubgroupSize+invocationIndex,lv1_val[i]); + scratchAccessor.template set(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv1_val[i]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -193,7 +193,7 @@ struct scan dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); scalar_t left; scratchAccessor.template get(bankedIndex,left); if (Exclusive) @@ -242,10 +242,10 @@ struct reduce vector_lv0_t scan_local; dataAccessor.template 
get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); scan_local = reduction0(scan_local); - if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) + if (Config::electLast()) { const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -258,11 +258,11 @@ struct reduce vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(i*Config::SubgroupSize+invocationIndex,lv1_val[i]); + scratchAccessor.template get(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv1_val[i]); lv1_val = reduction1(lv1_val); - if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) + if (Config::electLast()) { - const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(invocationIndex, Config::ItemsPerInvocation_2); // (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (invocationIndex/Config::ItemsPerInvocation_2); + const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(invocationIndex, Config::ItemsPerInvocation_2); // (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (invocationIndex/Config::ItemsPerInvocation_2); scratchAccessor.template set(bankedIndex, 
lv1_val[Config::ItemsPerInvocation_1-1]); } } @@ -275,7 +275,7 @@ struct reduce vector_lv2_t lv2_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(i*Config::SubgroupSize+invocationIndex,lv2_val[i]); + scratchAccessor.template get(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv2_val[i]); lv2_val = reduction2(lv2_val); scratchAccessor.template set(invocationIndex, lv2_val[Config::ItemsPerInvocation_2-1]); } @@ -314,10 +314,10 @@ struct scan dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); value = inclusiveScan0(value); dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) + if (Config::electLast()) { const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -332,15 +332,15 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(i*Config::SubgroupSize+prevIndex,lv1_val[i]); + scratchAccessor.template get(Config::sharedCoalescedIndexByComponent(prevIndex, i),lv1_val[i]); lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); lv1_val = inclusiveScan1(lv1_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template set(i*Config::SubgroupSize+invocationIndex,lv1_val[i]); - if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) + scratchAccessor.template 
set(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv1_val[i]); + if (Config::electLast()) { - const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); + const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); scratchAccessor.template set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } @@ -354,12 +354,12 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(lv1_smem_size+i*Config::SubgroupSize+prevIndex,lv2_val[i]); + scratchAccessor.template get(lv1_smem_size+Config::sharedCoalescedIndexByComponent(prevIndex, i),lv2_val[i]); lv2_val[0] = hlsl::mix(hlsl::promote(BinOp::identity), lv2_val[0], bool(invocationIndex)); lv2_val = inclusiveScan2(lv2_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template set(lv1_smem_size+i*Config::SubgroupSize+invocationIndex,lv2_val[i]); + scratchAccessor.template set(lv1_smem_size+Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv2_val[i]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -372,12 +372,12 @@ struct scan scratchAccessor.template get(i*Config::SubgroupSize+invocationIndex,lv1_val[i]); scalar_t lv2_scan; - const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); + const uint32_t bankedIndex = 
Config::sharedCoalescedIndexNextLevel(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); scratchAccessor.template set(lv1_smem_size+bankedIndex, lv2_scan); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template set(i*Config::SubgroupSize+invocationIndex, binop(lv1_val[i],lv2_scan)); + scratchAccessor.template set(Config::sharedCoalescedIndexByComponent(invocationIndex, i), binop(lv1_val[i],lv2_scan)); } // combine with level 0 @@ -388,7 +388,7 @@ struct scan dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); scalar_t left; scratchAccessor.template get(bankedIndex,left); if (Exclusive) From 14e5d15b830376e91de7066e233bdf0108230863 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 27 May 2025 12:24:17 +0700 Subject: [PATCH 213/346] added generic data/shared mem accessors --- .../builtin/hlsl/concepts/accessors/fft.hlsl | 44 ++------------ .../accessors/generic_shared_data.hlsl | 59 +++++++++++++++++++ 2 files changed, 64 insertions(+), 39 deletions(-) create mode 100644 include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl diff --git a/include/nbl/builtin/hlsl/concepts/accessors/fft.hlsl b/include/nbl/builtin/hlsl/concepts/accessors/fft.hlsl index 262cb3c0c7..9088b0c7b4 100644 --- a/include/nbl/builtin/hlsl/concepts/accessors/fft.hlsl +++ b/include/nbl/builtin/hlsl/concepts/accessors/fft.hlsl @@ -1,7 +1,7 @@ #ifndef _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_FFT_INCLUDED_ #define 
_NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_FFT_INCLUDED_ -#include "nbl/builtin/hlsl/concepts.hlsl" +#include "nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl" #include "nbl/builtin/hlsl/fft/common.hlsl" namespace nbl @@ -17,49 +17,15 @@ namespace fft // * void set(uint32_t index, in uint32_t value); // * void workgroupExecutionAndMemoryBarrier(); -#define NBL_CONCEPT_NAME FFTSharedMemoryAccessor -#define NBL_CONCEPT_TPLT_PRM_KINDS (typename) -#define NBL_CONCEPT_TPLT_PRM_NAMES (T) -#define NBL_CONCEPT_PARAM_0 (accessor, T) -#define NBL_CONCEPT_PARAM_1 (index, uint32_t) -#define NBL_CONCEPT_PARAM_2 (val, uint32_t) -NBL_CONCEPT_BEGIN(3) -#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 -#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 -#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 -NBL_CONCEPT_END( - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), is_same_v, void)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.workgroupExecutionAndMemoryBarrier()), is_same_v, void)) -); -#undef val -#undef index -#undef accessor -#include - +template +NBL_BOOL_CONCEPT FFTSharedMemoryAccessor = concepts::accessors::GenericSharedMemoryAccessor; // The Accessor (for a small FFT) MUST provide the following methods: // * void get(uint32_t index, NBL_REF_ARG(complex_t) value); // * void set(uint32_t index, in complex_t value); -#define NBL_CONCEPT_NAME FFTAccessor -#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename) -#define NBL_CONCEPT_TPLT_PRM_NAMES (T)(Scalar) -#define NBL_CONCEPT_PARAM_0 (accessor, T) -#define NBL_CONCEPT_PARAM_1 (index, uint32_t) -#define NBL_CONCEPT_PARAM_2 (val, complex_t) -NBL_CONCEPT_BEGIN(3) -#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 -#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 -#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 -NBL_CONCEPT_END( - 
((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set >(index, val)), is_same_v, void)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get >(index, val)), is_same_v, void)) -); -#undef val -#undef index -#undef accessor -#include +template +NBL_BOOL_CONCEPT FFTAccessor = concepts::accessors::GenericDataAccessor,I>; } } diff --git a/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl b/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl new file mode 100644 index 0000000000..4e6b974249 --- /dev/null +++ b/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl @@ -0,0 +1,59 @@ +#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_WORKGROUP_ARITHMETIC_INCLUDED_ +#define _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_WORKGROUP_ARITHMETIC_INCLUDED_ + +#include "nbl/builtin/hlsl/concepts.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace concepts +{ +namespace accessors +{ + +#define NBL_CONCEPT_NAME GenericSharedMemoryAccessor +#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename)(typename) +#define NBL_CONCEPT_TPLT_PRM_NAMES (T)(V)(I) +#define NBL_CONCEPT_PARAM_0 (accessor, T) +#define NBL_CONCEPT_PARAM_1 (index, I) +#define NBL_CONCEPT_PARAM_2 (val, V) +NBL_CONCEPT_BEGIN(3) +#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 +#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 +#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 +NBL_CONCEPT_END( + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.workgroupExecutionAndMemoryBarrier()), is_same_v, void)) +); +#undef val +#undef index +#undef accessor +#include + +#define NBL_CONCEPT_NAME GenericDataAccessor +#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename)(typename) +#define NBL_CONCEPT_TPLT_PRM_NAMES (T)(V)(I) +#define NBL_CONCEPT_PARAM_0 (accessor, T) +#define NBL_CONCEPT_PARAM_1 (index, 
I) +#define NBL_CONCEPT_PARAM_2 (val, V) +NBL_CONCEPT_BEGIN(3) +#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 +#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 +#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 +NBL_CONCEPT_END( + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) +); +#undef val +#undef index +#undef accessor +#include + +} +} +} +} + +#endif From f07329e42145deff72b832faf4bf07b6ada39e5e Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 27 May 2025 13:47:25 +0700 Subject: [PATCH 214/346] fix include guard --- .../builtin/hlsl/concepts/accessors/generic_shared_data.hlsl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl b/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl index 4e6b974249..db71228162 100644 --- a/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl +++ b/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl @@ -1,5 +1,5 @@ -#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_WORKGROUP_ARITHMETIC_INCLUDED_ -#define _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_WORKGROUP_ARITHMETIC_INCLUDED_ +#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_GENERIC_SHARED_DATA_INCLUDED_ +#define _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_GENERIC_SHARED_DATA_INCLUDED_ #include "nbl/builtin/hlsl/concepts.hlsl" From 48a7d161aeb5b921cb5211465ec2d4cbcc177fe9 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 27 May 2025 13:49:40 +0700 Subject: [PATCH 215/346] changes to arithmetic accessor concepts --- examples_tests | 2 +- .../accessors/workgroup_arithmetic.hlsl | 38 ++++++------------- .../builtin/hlsl/workgroup2/arithmetic.hlsl | 12 +++--- 3 files changed, 19 insertions(+), 33 deletions(-) diff --git a/examples_tests b/examples_tests index 2a85f4e091..99f6dfe5b4 160000 --- a/examples_tests +++ 
b/examples_tests @@ -1 +1 @@ -Subproject commit 2a85f4e0911185a85df31f798b92e6902db3383e +Subproject commit 99f6dfe5b4345cc8bbe7ff2ab2353993e395d3bd diff --git a/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl b/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl index de5e5a3c35..cbccbec034 100644 --- a/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl +++ b/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl @@ -1,7 +1,7 @@ #ifndef _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_WORKGROUP_ARITHMETIC_INCLUDED_ #define _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_WORKGROUP_ARITHMETIC_INCLUDED_ -#include "nbl/builtin/hlsl/concepts.hlsl" +#include "nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl" namespace nbl { @@ -10,46 +10,30 @@ namespace hlsl namespace workgroup2 { -#define NBL_CONCEPT_NAME ArithmeticSharedMemoryAccessor -#define NBL_CONCEPT_TPLT_PRM_KINDS (typename) -#define NBL_CONCEPT_TPLT_PRM_NAMES (T) -#define NBL_CONCEPT_PARAM_0 (accessor, T) -#define NBL_CONCEPT_PARAM_1 (index, uint32_t) -#define NBL_CONCEPT_PARAM_2 (val, uint32_t) -NBL_CONCEPT_BEGIN(3) -#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 -#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 -#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 -NBL_CONCEPT_END( - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), is_same_v, void)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.workgroupExecutionAndMemoryBarrier()), is_same_v, void)) -); -#undef val -#undef index -#undef accessor -#include +template +NBL_BOOL_CONCEPT ArithmeticSharedMemoryAccessor = concepts::accessors::GenericSharedMemoryAccessor; -#define NBL_CONCEPT_NAME ArithmeticDataAccessor -#define NBL_CONCEPT_TPLT_PRM_KINDS (typename) -#define NBL_CONCEPT_TPLT_PRM_NAMES (T) +#define NBL_CONCEPT_NAME ArithmeticReadOnlyDataAccessor +#define 
NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename) +#define NBL_CONCEPT_TPLT_PRM_NAMES (T)(V) #define NBL_CONCEPT_PARAM_0 (accessor, T) #define NBL_CONCEPT_PARAM_1 (index, uint32_t) -#define NBL_CONCEPT_PARAM_2 (val, uint32_t) +#define NBL_CONCEPT_PARAM_2 (val, V) NBL_CONCEPT_BEGIN(3) #define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 #define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 #define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 NBL_CONCEPT_END( - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), is_same_v, void)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.workgroupExecutionAndMemoryBarrier()), is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) ); #undef val #undef index #undef accessor #include +template +NBL_BOOL_CONCEPT ArithmeticDataAccessor = concepts::accessors::GenericDataAccessor; + } } } diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl index e4a71bdffc..6702504fa8 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl @@ -6,8 +6,6 @@ #include "nbl/builtin/hlsl/functional.hlsl" -#include "nbl/builtin/hlsl/workgroup/ballot.hlsl" -#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" #include "nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl" #include "nbl/builtin/hlsl/workgroup2/shared_scan.hlsl" @@ -24,7 +22,7 @@ struct reduction { using scalar_t = typename BinOp::type_t; - template && ArithmeticSharedMemoryAccessor) + template && ArithmeticSharedMemoryAccessor) static scalar_t __call(NBL_REF_ARG(ReadOnlyDataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { impl::reduce fn; @@ -35,7 +33,9 @@ struct reduction template struct inclusive_scan { - template && ArithmeticSharedMemoryAccessor) + using scalar_t = 
typename BinOp::type_t; + + template && ArithmeticSharedMemoryAccessor) static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { impl::scan fn; @@ -46,7 +46,9 @@ struct inclusive_scan template struct exclusive_scan { - template && ArithmeticSharedMemoryAccessor) + using scalar_t = typename BinOp::type_t; + + template && ArithmeticSharedMemoryAccessor) static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { impl::scan fn; From 20a54be14f624eb59e7030b2d14294f224e87750 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 27 May 2025 15:23:28 +0700 Subject: [PATCH 216/346] concept macro for checking types --- include/nbl/builtin/hlsl/concepts.hlsl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/nbl/builtin/hlsl/concepts.hlsl b/include/nbl/builtin/hlsl/concepts.hlsl index 7fd725dc2b..4b82955bb7 100644 --- a/include/nbl/builtin/hlsl/concepts.hlsl +++ b/include/nbl/builtin/hlsl/concepts.hlsl @@ -33,6 +33,7 @@ namespace concepts #define NBL_CONCEPT_REQ_EXPR 1 // #define NBL_CONCEPT_REQ_EXPR_RET_TYPE 2 +#define NBL_CONCEPT_REQ_TYPE_ALIAS_CONCEPT 3 //! Now diverge @@ -64,8 +65,9 @@ concept NBL_CONCEPT_NAME = requires BOOST_PP_EXPR_IF(LOCAL_PARAM_COUNT,(BOOST_PP #define NBL_IMPL_CONCEPT_REQ_TYPE(...) typename __VA_ARGS__; #define NBL_IMPL_CONCEPT_REQ_EXPR(...) __VA_ARGS__; #define NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE(E,C,...) {E}; C; +#define NBL_IMPL_CONCEPT_REQ_TYPE_ALIAS_CONCEPT(C,...) 
C< __VA_ARGS__ >; // -#define NBL_IMPL_CONCEPT (NBL_IMPL_CONCEPT_REQ_TYPE,NBL_IMPL_CONCEPT_REQ_EXPR,NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE) +#define NBL_IMPL_CONCEPT (NBL_IMPL_CONCEPT_REQ_TYPE,NBL_IMPL_CONCEPT_REQ_EXPR,NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE,NBL_IMPL_CONCEPT_REQ_TYPE_ALIAS_CONCEPT) // #define NBL_IMPL_CONCEPT_END_DEF(r,unused,i,e) NBL_EVAL(BOOST_PP_TUPLE_ELEM(BOOST_PP_SEQ_HEAD(e),NBL_IMPL_CONCEPT) BOOST_PP_SEQ_TAIL(e)) // @@ -95,8 +97,9 @@ concept NBL_CONCEPT_NAME = requires BOOST_PP_EXPR_IF(LOCAL_PARAM_COUNT,(BOOST_PP #define NBL_IMPL_CONCEPT_REQ_TYPE(...) ::nbl::hlsl::make_void_t #define NBL_IMPL_CONCEPT_REQ_EXPR(...) ::nbl::hlsl::make_void_t #define NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE(E,C,...) ::nbl::hlsl::enable_if_t > +#define NBL_IMPL_CONCEPT_REQ_TYPE_ALIAS_CONCEPT(C,...) ::nbl::hlsl::enable_if_t > // -#define NBL_IMPL_CONCEPT_SFINAE (NBL_IMPL_CONCEPT_REQ_TYPE,NBL_IMPL_CONCEPT_REQ_EXPR,NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE) +#define NBL_IMPL_CONCEPT_SFINAE (NBL_IMPL_CONCEPT_REQ_TYPE,NBL_IMPL_CONCEPT_REQ_EXPR,NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE,NBL_IMPL_CONCEPT_REQ_TYPE_ALIAS_CONCEPT) // #define NBL_IMPL_CONCEPT_END_DEF(r,unused,i,e) template \ struct BOOST_PP_CAT(__requirement,i) : ::nbl::hlsl::false_type {}; \ From d83ac5cbf9301b173c8199118f0d9937c80e5186 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 27 May 2025 15:41:20 +0700 Subject: [PATCH 217/346] revert concept macro addition --- include/nbl/builtin/hlsl/concepts.hlsl | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/include/nbl/builtin/hlsl/concepts.hlsl b/include/nbl/builtin/hlsl/concepts.hlsl index 4b82955bb7..7fd725dc2b 100644 --- a/include/nbl/builtin/hlsl/concepts.hlsl +++ b/include/nbl/builtin/hlsl/concepts.hlsl @@ -33,7 +33,6 @@ namespace concepts #define NBL_CONCEPT_REQ_EXPR 1 // #define NBL_CONCEPT_REQ_EXPR_RET_TYPE 2 -#define NBL_CONCEPT_REQ_TYPE_ALIAS_CONCEPT 3 //! 
Now diverge @@ -65,9 +64,8 @@ concept NBL_CONCEPT_NAME = requires BOOST_PP_EXPR_IF(LOCAL_PARAM_COUNT,(BOOST_PP #define NBL_IMPL_CONCEPT_REQ_TYPE(...) typename __VA_ARGS__; #define NBL_IMPL_CONCEPT_REQ_EXPR(...) __VA_ARGS__; #define NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE(E,C,...) {E}; C; -#define NBL_IMPL_CONCEPT_REQ_TYPE_ALIAS_CONCEPT(C,...) C< __VA_ARGS__ >; // -#define NBL_IMPL_CONCEPT (NBL_IMPL_CONCEPT_REQ_TYPE,NBL_IMPL_CONCEPT_REQ_EXPR,NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE,NBL_IMPL_CONCEPT_REQ_TYPE_ALIAS_CONCEPT) +#define NBL_IMPL_CONCEPT (NBL_IMPL_CONCEPT_REQ_TYPE,NBL_IMPL_CONCEPT_REQ_EXPR,NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE) // #define NBL_IMPL_CONCEPT_END_DEF(r,unused,i,e) NBL_EVAL(BOOST_PP_TUPLE_ELEM(BOOST_PP_SEQ_HEAD(e),NBL_IMPL_CONCEPT) BOOST_PP_SEQ_TAIL(e)) // @@ -97,9 +95,8 @@ concept NBL_CONCEPT_NAME = requires BOOST_PP_EXPR_IF(LOCAL_PARAM_COUNT,(BOOST_PP #define NBL_IMPL_CONCEPT_REQ_TYPE(...) ::nbl::hlsl::make_void_t #define NBL_IMPL_CONCEPT_REQ_EXPR(...) ::nbl::hlsl::make_void_t #define NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE(E,C,...) ::nbl::hlsl::enable_if_t > -#define NBL_IMPL_CONCEPT_REQ_TYPE_ALIAS_CONCEPT(C,...) 
::nbl::hlsl::enable_if_t > // -#define NBL_IMPL_CONCEPT_SFINAE (NBL_IMPL_CONCEPT_REQ_TYPE,NBL_IMPL_CONCEPT_REQ_EXPR,NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE,NBL_IMPL_CONCEPT_REQ_TYPE_ALIAS_CONCEPT) +#define NBL_IMPL_CONCEPT_SFINAE (NBL_IMPL_CONCEPT_REQ_TYPE,NBL_IMPL_CONCEPT_REQ_EXPR,NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE) // #define NBL_IMPL_CONCEPT_END_DEF(r,unused,i,e) template \ struct BOOST_PP_CAT(__requirement,i) : ::nbl::hlsl::false_type {}; \ From 00787bf305da99a9a13580dbe39faf95ddf05d72 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 27 May 2025 15:42:38 +0700 Subject: [PATCH 218/346] added generic read/write accessors --- .../accessors/generic_shared_data.hlsl | 46 +++++++++++++------ 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl b/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl index db71228162..cc22595444 100644 --- a/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl +++ b/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl @@ -16,15 +16,15 @@ namespace accessors #define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename)(typename) #define NBL_CONCEPT_TPLT_PRM_NAMES (T)(V)(I) #define NBL_CONCEPT_PARAM_0 (accessor, T) -#define NBL_CONCEPT_PARAM_1 (index, I) -#define NBL_CONCEPT_PARAM_2 (val, V) +#define NBL_CONCEPT_PARAM_1 (val, V) +#define NBL_CONCEPT_PARAM_2 (index, I) NBL_CONCEPT_BEGIN(3) #define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 -#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 -#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 +#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 +#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 NBL_CONCEPT_END( - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), is_same_v, void)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), 
is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.workgroupExecutionAndMemoryBarrier()), is_same_v, void)) ); #undef val @@ -32,25 +32,45 @@ NBL_CONCEPT_END( #undef accessor #include -#define NBL_CONCEPT_NAME GenericDataAccessor +#define NBL_CONCEPT_NAME GenericReadAccessor #define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename)(typename) #define NBL_CONCEPT_TPLT_PRM_NAMES (T)(V)(I) #define NBL_CONCEPT_PARAM_0 (accessor, T) -#define NBL_CONCEPT_PARAM_1 (index, I) -#define NBL_CONCEPT_PARAM_2 (val, V) +#define NBL_CONCEPT_PARAM_1 (val, V) +#define NBL_CONCEPT_PARAM_2 (index, I) NBL_CONCEPT_BEGIN(3) #define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 -#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 -#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 +#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 +#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 NBL_CONCEPT_END( - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), is_same_v, void)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) ); #undef val #undef index #undef accessor #include +#define NBL_CONCEPT_NAME GenericWriteAccessor +#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename)(typename) +#define NBL_CONCEPT_TPLT_PRM_NAMES (T)(V)(I) +#define NBL_CONCEPT_PARAM_0 (accessor, T) +#define NBL_CONCEPT_PARAM_1 (val, V) +#define NBL_CONCEPT_PARAM_2 (index, I) +NBL_CONCEPT_BEGIN(3) +#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 +#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 +#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 +NBL_CONCEPT_END( + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), is_same_v, void)) +); +#undef val +#undef index +#undef accessor +#include + +template +NBL_BOOL_CONCEPT 
GenericDataAccessor = GenericWriteAccessor && GenericWriteAccessor; + } } } From c0dfc1eeddac4378dd8fc836ddb71efe7e9ee5b3 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 27 May 2025 15:43:37 +0700 Subject: [PATCH 219/346] more refactor for accessor concept changes --- .../accessors/workgroup_arithmetic.hlsl | 21 +--- .../hlsl/workgroup2/arithmetic_config.hlsl | 8 +- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 105 +++++++++--------- 3 files changed, 59 insertions(+), 75 deletions(-) diff --git a/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl b/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl index cbccbec034..267342634f 100644 --- a/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl +++ b/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl @@ -10,26 +10,11 @@ namespace hlsl namespace workgroup2 { -template +template NBL_BOOL_CONCEPT ArithmeticSharedMemoryAccessor = concepts::accessors::GenericSharedMemoryAccessor; -#define NBL_CONCEPT_NAME ArithmeticReadOnlyDataAccessor -#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename) -#define NBL_CONCEPT_TPLT_PRM_NAMES (T)(V) -#define NBL_CONCEPT_PARAM_0 (accessor, T) -#define NBL_CONCEPT_PARAM_1 (index, uint32_t) -#define NBL_CONCEPT_PARAM_2 (val, V) -NBL_CONCEPT_BEGIN(3) -#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 -#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 -#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 -NBL_CONCEPT_END( - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) -); -#undef val -#undef index -#undef accessor -#include +template +NBL_BOOL_CONCEPT ArithmeticReadOnlyDataAccessor = concepts::accessors::GenericReadAccessor; template NBL_BOOL_CONCEPT ArithmeticDataAccessor = concepts::accessors::GenericDataAccessor; diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 
8ecbe4b5dc..7611036a49 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -66,17 +66,17 @@ struct ArithmeticConfiguration return glsl::gl_SubgroupInvocationID()==SubgroupSize-1; } - static uint32_t virtualSubgroupID(const uint32_t subgroupID, const uint32_t virtualIdx) + static uint32_t virtualSubgroupID(const uint32_t subgroupID, const uint32_t workgroupInVirtualIndex) { - return virtualIdx * (WorkgroupSize >> SubgroupSizeLog2) + subgroupID; + return workgroupInVirtualIndex * (WorkgroupSize >> SubgroupSizeLog2) + subgroupID; } - static uint32_t sharedCoalescedIndexNextLevel(const uint32_t subgroupID, const uint32_t itemsPerInvocation) + static uint32_t sharedStoreIndex(const uint32_t subgroupID, const uint32_t itemsPerInvocation) { return (subgroupID & (itemsPerInvocation-1)) * SubgroupSize + (subgroupID/itemsPerInvocation); } - static uint32_t sharedCoalescedIndexByComponent(const uint32_t invocationIndex, const uint32_t component) + static uint32_t sharedLoadIndex(const uint32_t invocationIndex, const uint32_t component) { return component * SubgroupSize + invocationIndex; } diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index dd309e0e12..96b2ffdd97 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -33,7 +33,7 @@ struct reduce { using scalar_t = typename BinOp::type_t; using vector_t = vector; // data accessor needs to be this type - // doesn't use scratch smem, need as param? 
+ // doesn't use scratch smem, should be NOOP accessor template scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) @@ -43,10 +43,8 @@ struct reduce subgroup2::reduction reduction; vector_t value; - dataAccessor.template get(workgroup::SubgroupContiguousIndex(), value); - value = reduction(value); - return value[0]; - // dataAccessor.template set(workgroup::SubgroupContiguousIndex(), value); + dataAccessor.template get(glsl::gl_SubgroupInvocationID(), value); + return reduction(value); } }; @@ -55,7 +53,7 @@ struct scan { using scalar_t = typename BinOp::type_t; using vector_t = vector; // data accessor needs to be this type - // doesn't use scratch smem, need as param? + // doesn't use scratch smem, should be NOOP accessor template void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) @@ -64,7 +62,7 @@ struct scan using params_t = subgroup2::ArithmeticParams; vector_t value; - dataAccessor.template get(workgroup::SubgroupContiguousIndex(), value); + dataAccessor.template get(glsl::gl_SubgroupInvocationID(), value); if (Exclusive) { subgroup2::exclusive_scan excl_scan; @@ -75,7 +73,7 @@ struct scan subgroup2::inclusive_scan incl_scan; value = incl_scan(value); } - dataAccessor.template set(workgroup::SubgroupContiguousIndex(), value); // can be safely merged with above lines? 
+ dataAccessor.template set(glsl::gl_SubgroupInvocationID(), value); } }; @@ -102,13 +100,13 @@ struct reduce for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { vector_lv0_t scan_local; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); scan_local = reduction0(scan_local); if (Config::electLast()) { const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); - scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + const uint32_t bankedIndex = Config::sharedStoreIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -120,16 +118,16 @@ struct reduce vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv1_val[i]); + scratchAccessor.template get(Config::sharedLoadIndex(invocationIndex, i),lv1_val[i]); lv1_val = reduction1(lv1_val); if (Config::electLast()) - scratchAccessor.template set(0, lv1_val[Config::ItemsPerInvocation_1-1]); + scratchAccessor.template set(0, 
lv1_val[Config::ItemsPerInvocation_1-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); scalar_t reduce_val; - scratchAccessor.template get(0,reduce_val); + scratchAccessor.template get(0,reduce_val); return reduce_val; } }; @@ -156,14 +154,14 @@ struct scan for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { vector_lv0_t value; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); value = inclusiveScan0(value); - dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); if (Config::electLast()) { const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); - scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + const uint32_t bankedIndex = Config::sharedStoreIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -176,12 +174,12 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template 
get(Config::sharedCoalescedIndexByComponent(prevIndex, i),lv1_val[i]); + scratchAccessor.template get(Config::sharedLoadIndex(prevIndex, i),lv1_val[i]); lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); lv1_val = inclusiveScan1(lv1_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template set(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv1_val[i]); + scratchAccessor.template set(Config::sharedLoadIndex(invocationIndex, i),lv1_val[i]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -190,12 +188,12 @@ struct scan for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { vector_lv0_t value; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = Config::sharedStoreIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); scalar_t left; - scratchAccessor.template get(bankedIndex,left); + scratchAccessor.template get(bankedIndex,left); if (Exclusive) { scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(value[Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); @@ -210,7 +208,7 @@ struct scan for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) value[i] = binop(left, value[i]); } - dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); } } }; @@ -240,30 +238,31 @@ struct reduce for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < 
Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { vector_lv0_t scan_local; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); scan_local = reduction0(scan_local); if (Config::electLast()) { const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); - scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + const uint32_t bankedIndex = Config::sharedStoreIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); // level 1 scan + const uint32_t lv1_smem_size = Config::SubgroupsSize*Config::ItemsPerInvocation_1; subgroup2::reduction reduction1; - if (glsl::gl_SubgroupID() < Config::SubgroupSizeLog2*Config::ItemsPerInvocation_1) + if (glsl::gl_SubgroupID() < Config::SubgroupSize*Config::ItemsPerInvocation_2) { vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv1_val[i]); + scratchAccessor.template get(Config::sharedLoadIndex(invocationIndex, i),lv1_val[i]); lv1_val = reduction1(lv1_val); if (Config::electLast()) { - const uint32_t bankedIndex = 
Config::sharedCoalescedIndexNextLevel(invocationIndex, Config::ItemsPerInvocation_2); // (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (invocationIndex/Config::ItemsPerInvocation_2); - scratchAccessor.template set(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + const uint32_t bankedIndex = Config::sharedStoreIndex(invocationIndex, Config::ItemsPerInvocation_2); // (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (invocationIndex/Config::ItemsPerInvocation_2); + scratchAccessor.template set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -275,14 +274,14 @@ struct reduce vector_lv2_t lv2_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv2_val[i]); + scratchAccessor.template get(lv1_smem_size+Config::sharedLoadIndex(invocationIndex, i),lv2_val[i]); lv2_val = reduction2(lv2_val); - scratchAccessor.template set(invocationIndex, lv2_val[Config::ItemsPerInvocation_2-1]); + scratchAccessor.template set(invocationIndex, lv2_val[Config::ItemsPerInvocation_2-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); scalar_t reduce_val; - scratchAccessor.template get(0,reduce_val); + scratchAccessor.template get(0,reduce_val); return reduce_val; } }; @@ -311,14 +310,14 @@ struct scan for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { vector_lv0_t value; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); value = inclusiveScan0(value); - dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template set(idx * 
Config::WorkgroupSize + virtualInvocationIndex, value); if (Config::electLast()) { const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); - scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + const uint32_t bankedIndex = Config::sharedStoreIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); + scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -332,16 +331,16 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::sharedCoalescedIndexByComponent(prevIndex, i),lv1_val[i]); + scratchAccessor.template get(Config::sharedLoadIndex(prevIndex, i),lv1_val[i]); lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); lv1_val = inclusiveScan1(lv1_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template set(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv1_val[i]); + scratchAccessor.template set(Config::sharedLoadIndex(invocationIndex, i),lv1_val[i]); if (Config::electLast()) { - const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); - scratchAccessor.template set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + const uint32_t bankedIndex = Config::sharedStoreIndex(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & 
(Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); + scratchAccessor.template set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -354,12 +353,12 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(lv1_smem_size+Config::sharedCoalescedIndexByComponent(prevIndex, i),lv2_val[i]); - lv2_val[0] = hlsl::mix(hlsl::promote(BinOp::identity), lv2_val[0], bool(invocationIndex)); + scratchAccessor.template get(lv1_smem_size+Config::sharedLoadIndex(prevIndex, i),lv2_val[i]); + lv2_val[0] = hlsl::mix(BinOp::identity, lv2_val[0], bool(invocationIndex)); lv2_val = inclusiveScan2(lv2_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template set(lv1_smem_size+Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv2_val[i]); + scratchAccessor.template set(lv1_smem_size+Config::sharedLoadIndex(invocationIndex, i),lv2_val[i]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -369,15 +368,15 @@ struct scan vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(i*Config::SubgroupSize+invocationIndex,lv1_val[i]); + scratchAccessor.template get(i*Config::SubgroupSize+invocationIndex,lv1_val[i]); scalar_t lv2_scan; - const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); - scratchAccessor.template set(lv1_smem_size+bankedIndex, lv2_scan); + const uint32_t bankedIndex = Config::sharedStoreIndex(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * 
Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); + scratchAccessor.template set(lv1_smem_size+bankedIndex, lv2_scan); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template set(Config::sharedCoalescedIndexByComponent(invocationIndex, i), binop(lv1_val[i],lv2_scan)); + scratchAccessor.template set(Config::sharedLoadIndex(invocationIndex, i), binop(lv1_val[i],lv2_scan)); } // combine with level 0 @@ -385,12 +384,12 @@ struct scan for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { vector_lv0_t value; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = Config::sharedStoreIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); scalar_t left; - scratchAccessor.template get(bankedIndex,left); + scratchAccessor.template get(bankedIndex,left); if (Exclusive) { scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(value[Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); @@ -405,7 +404,7 @@ struct scan for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) value[i] = binop(left, value[i]); } - dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); } } }; From 55840a3063fb64ef79f84ffc51b6392fbed1530e Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 27 May 2025 16:13:50 +0700 Subject: [PATCH 220/346] don't pass scalar_t as index type --- examples_tests | 2 +- 
include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples_tests b/examples_tests index 99f6dfe5b4..3d898943fb 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 99f6dfe5b4345cc8bbe7ff2ab2353993e395d3bd +Subproject commit 3d898943fb9bd4690aa3b92b7a80f5a61198f0de diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl index 6702504fa8..643f8d123e 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl @@ -22,7 +22,7 @@ struct reduction { using scalar_t = typename BinOp::type_t; - template && ArithmeticSharedMemoryAccessor) + template && ArithmeticSharedMemoryAccessor) static scalar_t __call(NBL_REF_ARG(ReadOnlyDataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { impl::reduce fn; @@ -35,7 +35,7 @@ struct inclusive_scan { using scalar_t = typename BinOp::type_t; - template && ArithmeticSharedMemoryAccessor) + template && ArithmeticSharedMemoryAccessor) static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { impl::scan fn; @@ -48,7 +48,7 @@ struct exclusive_scan { using scalar_t = typename BinOp::type_t; - template && ArithmeticSharedMemoryAccessor) + template && ArithmeticSharedMemoryAccessor) static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { impl::scan fn; From d758ff7474aecd42c1ec11769482fed9e70b0d9e Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 27 May 2025 16:31:39 +0700 Subject: [PATCH 221/346] refactor accessor to match accessor template --- examples_tests | 2 +- include/nbl/builtin/hlsl/memory_accessor.hlsl | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/examples_tests b/examples_tests index 3d898943fb..3d63ed7328 160000 --- a/examples_tests +++ b/examples_tests 
@@ -1 +1 @@ -Subproject commit 3d898943fb9bd4690aa3b92b7a80f5a61198f0de +Subproject commit 3d63ed732838c3073dfb7993d3eb1305fb5882be diff --git a/include/nbl/builtin/hlsl/memory_accessor.hlsl b/include/nbl/builtin/hlsl/memory_accessor.hlsl index 99ec0736a4..2194b1e917 100644 --- a/include/nbl/builtin/hlsl/memory_accessor.hlsl +++ b/include/nbl/builtin/hlsl/memory_accessor.hlsl @@ -112,8 +112,8 @@ struct StructureOfArrays : impl::StructureOfArraysBase - enable_if_t get(const index_t ix, NBL_REF_ARG(T) value) + template + enable_if_t get(const I ix, NBL_REF_ARG(T) value) { NBL_CONSTEXPR uint64_t SubElementCount = sizeof(T)/sizeof(access_t); // `vector` for now, we'll use `array` later when `bit_cast` gets fixed @@ -123,8 +123,8 @@ struct StructureOfArrays : impl::StructureOfArraysBase >(aux); } - template - enable_if_t set(const index_t ix, NBL_CONST_REF_ARG(T) value) + template + enable_if_t set(const I ix, NBL_CONST_REF_ARG(T) value) { NBL_CONSTEXPR uint64_t SubElementCount = sizeof(T)/sizeof(access_t); // `vector` for now, we'll use `array` later when `bit_cast` gets fixed @@ -209,11 +209,11 @@ struct Offset : impl::OffsetBase BaseAccessor accessor; - template - void set(index_t idx, T value) {accessor.set(idx+base_t::offset,value); } + template + void set(I idx, T value) {accessor.set(idx+base_t::offset,value); } - template - void get(index_t idx, NBL_REF_ARG(T) value) {accessor.get(idx+base_t::offset,value);} + template + void get(I idx, NBL_REF_ARG(T) value) {accessor.get(idx+base_t::offset,value);} template enable_if_t< From b062ede97571b771c36f2a674045367baee901f7 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 27 May 2025 17:18:41 +0700 Subject: [PATCH 222/346] simplified indexing functions --- .../hlsl/workgroup2/arithmetic_config.hlsl | 15 +++++++-- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 33 ++++++++----------- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl 
b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 7611036a49..e02c74e80b 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -71,9 +71,20 @@ struct ArithmeticConfiguration return workgroupInVirtualIndex * (WorkgroupSize >> SubgroupSizeLog2) + subgroupID; } - static uint32_t sharedStoreIndex(const uint32_t subgroupID, const uint32_t itemsPerInvocation) + template + static uint32_t sharedStoreIndex(const uint32_t subgroupID) { - return (subgroupID & (itemsPerInvocation-1)) * SubgroupSize + (subgroupID/itemsPerInvocation); + if (level<2) + return (subgroupID & (ItemsPerInvocation_1-1)) * SubgroupSize + (subgroupID/ItemsPerInvocation_1); + else + return (subgroupID & (ItemsPerInvocation_2-1)) * SubgroupSize + (subgroupID/ItemsPerInvocation_2); + } + + template + static uint32_t sharedStoreIndexFromVirtualIndex(const uint32_t subgroupID, const uint32_t workgroupInVirtualIndex) + { + const uint32_t virtualID = virtualSubgroupID(subgroupID, workgroupInVirtualIndex); + return sharedStoreIndex(virtualID); } static uint32_t sharedLoadIndex(const uint32_t invocationIndex, const uint32_t component) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 96b2ffdd97..418c3219f4 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -104,8 +104,7 @@ struct reduce scan_local = reduction0(scan_local); if (Config::electLast()) { - const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedStoreIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = Config::template 
sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -159,8 +158,7 @@ struct scan dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); if (Config::electLast()) { - const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedStoreIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -174,7 +172,7 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::sharedLoadIndex(prevIndex, i),lv1_val[i]); + scratchAccessor.template get(Config::sharedLoadIndex(invocationIndex, i)-1,lv1_val[i]); lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); lv1_val = inclusiveScan1(lv1_val); [unroll] @@ -190,8 +188,7 @@ struct scan vector_lv0_t value; dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedStoreIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); scalar_t left; scratchAccessor.template get(bankedIndex,left); if (Exclusive) @@ -242,8 +239,7 @@ struct reduce scan_local = 
reduction0(scan_local); if (Config::electLast()) { - const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedStoreIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -261,7 +257,7 @@ struct reduce lv1_val = reduction1(lv1_val); if (Config::electLast()) { - const uint32_t bankedIndex = Config::sharedStoreIndex(invocationIndex, Config::ItemsPerInvocation_2); // (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (invocationIndex/Config::ItemsPerInvocation_2); + const uint32_t bankedIndex = Config::template sharedStoreIndex<2>(invocationIndex); scratchAccessor.template set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } @@ -276,7 +272,8 @@ struct reduce for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) scratchAccessor.template get(lv1_smem_size+Config::sharedLoadIndex(invocationIndex, i),lv2_val[i]); lv2_val = reduction2(lv2_val); - scratchAccessor.template set(invocationIndex, lv2_val[Config::ItemsPerInvocation_2-1]); + if (Config::electLast()) + scratchAccessor.template set(0, lv2_val[Config::ItemsPerInvocation_2-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -315,8 +312,7 @@ struct scan dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); if (Config::electLast()) { - const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedStoreIndex(virtualSubgroupID, 
Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -331,7 +327,7 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::sharedLoadIndex(prevIndex, i),lv1_val[i]); + scratchAccessor.template get(Config::sharedLoadIndex(invocationIndex, i)-1,lv1_val[i]); lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); lv1_val = inclusiveScan1(lv1_val); [unroll] @@ -339,7 +335,7 @@ struct scan scratchAccessor.template set(Config::sharedLoadIndex(invocationIndex, i),lv1_val[i]); if (Config::electLast()) { - const uint32_t bankedIndex = Config::sharedStoreIndex(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); + const uint32_t bankedIndex = Config::template sharedStoreIndex<2>(glsl::gl_SubgroupID()); scratchAccessor.template set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } @@ -353,7 +349,7 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(lv1_smem_size+Config::sharedLoadIndex(prevIndex, i),lv2_val[i]); + scratchAccessor.template get(lv1_smem_size+Config::sharedLoadIndex(invocationIndex, i)-1,lv2_val[i]); lv2_val[0] = hlsl::mix(BinOp::identity, lv2_val[0], bool(invocationIndex)); lv2_val = inclusiveScan2(lv2_val); [unroll] @@ -371,7 +367,7 @@ struct scan scratchAccessor.template get(i*Config::SubgroupSize+invocationIndex,lv1_val[i]); scalar_t lv2_scan; - const uint32_t bankedIndex = 
Config::sharedStoreIndex(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); + const uint32_t bankedIndex = Config::template sharedStoreIndex<2>(glsl::gl_SubgroupID()); scratchAccessor.template set(lv1_smem_size+bankedIndex, lv2_scan); [unroll] @@ -386,8 +382,7 @@ struct scan vector_lv0_t value; dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedStoreIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); scalar_t left; scratchAccessor.template get(bankedIndex,left); if (Exclusive) From add176bb73b9e7b5a643ac15962b7c74ff754e92 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Tue, 27 May 2025 12:54:24 +0200 Subject: [PATCH 223/346] update NSC image creation to not violate Microsoft EULA, update .github/workflows/build-nabla.yml --- .github/workflows/build-nabla.yml | 5 +- tools/nsc/CMakeLists.txt | 76 +++++++++++++++++++------------ 2 files changed, 51 insertions(+), 30 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index a194734472..de1194d34b 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -20,7 +20,10 @@ jobs: strategy: fail-fast: false matrix: - vendor: [msvc, clangcl] + # vendor: [msvc, clangcl] + # TODO: Yas please fix ClangCL, we have a few new compile errors + # if we build MSVC then build "run-compiler-explorer" target, for ClangCL build just "nsc" + vendor: [msvc] config: [Release, Debug, RelWithDebInfo] tag: ['17.13.6'] diff --git a/tools/nsc/CMakeLists.txt b/tools/nsc/CMakeLists.txt index b0fec5b7f2..0fad4987be 100644 --- 
a/tools/nsc/CMakeLists.txt +++ b/tools/nsc/CMakeLists.txt @@ -73,12 +73,11 @@ find_program(SPIRV_DIS_EXE NAMES spirv-dis HINTS "${VULKAN_SDK}/Bin" REQUIRED) cmake_path(GET SPIRV_DIS_EXE PARENT_PATH SPIRV_DIS_DIR) cmake_path(NATIVE_PATH SPIRV_DIS_DIR NORMALIZE SPIRV_DIS_DIR) -include(InstallRequiredSystemLibraries) - -if(NOT MSVC_REDIST_DIR) - if(MSVC_REDIST_BASE) # fallback to our CI toolset - set(MSVC_REDIST_DIR "${MSVC_REDIST_BASE}") - else() +if(MSVC_REDIST_BASE) # fallback to our toolset + set(MSVC_REDIST_DIR "${MSVC_REDIST_BASE}") +else() + include(InstallRequiredSystemLibraries) + if(NOT MSVC_REDIST_DIR) message(FATAL_ERROR "Could not find MSVC_REDIST_DIR, define yourself!") endif() endif() @@ -111,7 +110,6 @@ cmake_path(NATIVE_PATH NBL_DOCKER_CT_NSC_VOLUME_SOURCE NORMALIZE NBL_DOCKER_CT_N cmake_path(NATIVE_PATH NBL_DOCKER_CT_NSC_VOLUME_TARGET NORMALIZE NBL_DOCKER_CT_NSC_VOLUME_TARGET) cmake_path(NATIVE_PATH NBL_NSC_PREINSTALL_DIRECTORY NORMALIZE NBL_NSC_PREINSTALL_DIRECTORY) -set(CORE_IMAGE mcr.microsoft.com/windows/servercore:ltsc2022) string(CONFIGURE [=[ # syntax=docker/dockerfile:1 # escape=` @@ -119,8 +117,6 @@ string(CONFIGURE [=[ # ---------------- COMPRESS STEP ---------------- FROM @BASE_IMAGE@ as compress -COPY --link --from=@CORE_IMAGE@ C:/Windows/System32/icu.dll C:/pack/Windows/System32/ -COPY --link --from=@CORE_IMAGE@ C:/Windows/Globalization/ICU/ C:/pack/Windows/Globalization/ICU/ COPY --link Runtimes/ C:/pack/Windows/System32/ COPY --link Nabla/ C:/pack/runtimes/Nabla/ @@ -140,11 +136,14 @@ COPY --link --from=compress ["C:/pack/nabla-artifacts.tar.zst", "C:/pack/"] COPY hlsl.local.properties.cmake C:/Compiler-Explorer/etc/config/hlsl.local.properties ENV NBL_INSTALL_DIRECTORY=@NBL_DOCKER_CT_NSC_VOLUME_TARGET@ ` -NBL_EXPLICIT_MODULE_LOAD_LOG=ON ` -ICU_DATA=C:\Windows\Globalization\ICU +NBL_EXPLICIT_MODULE_LOAD_LOG=ON WORKDIR C:/Compiler-Explorer -ENTRYPOINT ["C:\\unpack.bat", "&&", "node", "--no-warnings", "--no-deprecation", "--import=tsx", 
"./app.js", "--language", "hlsl"] +ENTRYPOINT [ ` + "C:\\unpack.bat", "&&", ` + "copy", "C:\\mount\\Windows\\System32\\icu.dll", "C:\\Windows\\System32\\icu.dll", "&&", ` + "node", "--no-warnings", "--no-deprecation", "--import=tsx", "./app.js", "--language", "hlsl" ` +] ]=] INSTRUCTIONS @ONLY) set(DOCKERFILE "${NBL_DOCKER_CTX_DIR}/Dockerfile") @@ -197,16 +196,28 @@ execute_process(COMMAND cmd /C ver OUTPUT_VARIABLE PIPE OUTPUT_STRIP_TRAILING_WH string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+" HOST_KERNEL "${PIPE}") PROMOTE_PROCESS_ISOLATION(${HOST_KERNEL} ${BASE_IMAGE} USE_PROCESS_ISOLATION) -if(USE_PROCESS_ISOLATION) - set(ISOLATION "--isolation process") -else() +if(NOT USE_PROCESS_ISOLATION) # NOTE: we would need to use GET_RUNTIME_DEPENDENCIES which uses objdump # https://cmake.org/cmake/help/latest/command/file.html#get-runtime-dependencies # to collect *all* missing deps and copy (FROM at least server core) to destination nano # image, it will fail currently if we fully isolate it with VM due to lack of certain DLLs + # BUT it means violating EULA, hence we are not going to support it, also (**) message(FATAL_ERROR "HyperV is NOT supported! 
Update your OS!") endif() +set(CORE_IMAGE mcr.microsoft.com/windows/servercore:ltsc2022) +set(ICU_DIR C:\\Windows\\Globalization\\ICU) +set(ICU_DLL C:\\Windows\\System32\\icu.dll) +if(NOT EXISTS ${ICU_DIR} OR NOT EXISTS ${ICU_DLL}) + # fallback for CI purposes, NOTE: we do NOT distribute those in final image as we have host runner requirements (**) + message(STATUS "\"${ICU_DIR}\" or \"${ICU_DLL}\ not found, fallback: copying them to the runner from \"${CORE_IMAGE}\"") + execute_process(COMMAND "${DOCKER_EXE}" rm -f nano-orphan RESULT_VARIABLE res) + execute_process(COMMAND "${DOCKER_EXE}" run -di --isolation process --name nano-orphan --entrypoint cmd ${CORE_IMAGE} COMMAND_ERROR_IS_FATAL ANY) + execute_process(COMMAND "${DOCKER_EXE}" cp nano-orphan:${ICU_DIR} ${ICU_DIR} COMMAND_ERROR_IS_FATAL ANY) + execute_process(COMMAND "${DOCKER_EXE}" cp nano-orphan:${ICU_DLL} ${ICU_DLL} COMMAND_ERROR_IS_FATAL ANY) + message(STATUS "Fallback completed, runner patched!") +endif() + set(ORPHAN nsc-orphan) set(NBL_CE_URL http://${ORPHAN}:10240) set(NBL_CE_HEALTHY_CHECK_PY "${NBL_ROOT_PATH}/docker/compiler-explorer/ce_healthy_check.py") @@ -215,14 +226,14 @@ set(NBL_NSC_BASIC_HLSL_JPAYLOAD "${CMAKE_CURRENT_SOURCE_DIR}/docker/godbolt/hlsl # to avoid "too long input" errors we proxy build instructions to CMake script and write it to build directory string(CONFIGURE [=[ -execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Killing remaining NSC orphans") -execute_process(COMMAND "${DOCKER_EXE}" rm -f "${ORPHAN}") +message(STATUS "Killing remaining NSC orphans") +execute_process(COMMAND "${DOCKER_EXE}" rm -f "${ORPHAN}" RESULT_VARIABLE res) -execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Executing CTests") +message(STATUS "Executing CTests") execute_process(COMMAND "${CTEST_EXE}" -C "$" --stop-on-failure WORKING_DIRECTORY "@CMAKE_CURRENT_BINARY_DIR@" COMMAND_ERROR_IS_FATAL ANY) -execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Generating NSC build info") +message(STATUS 
"Generating NSC build info") execute_process(COMMAND "${CMAKE_COMMAND}" "-DNBL_EXECUTABLE_PATH=${NBL_NSC_PREINSTALL_TARGET_EXE_FILEPATH}" "-DNBL_BUILD_INFO=${NBL_NSC_PREINSTALL_TARGET_BUILD_INFO}" @@ -231,7 +242,7 @@ execute_process(COMMAND "${CMAKE_COMMAND}" -P "${NBL_ROOT_PATH}/cmake/scripts/nbl/nablaBuildInfo.cmake" COMMAND_ERROR_IS_FATAL ANY) -execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Generating NSC godbolt config") +message(STATUS "Generating NSC godbolt config") execute_process(COMMAND "${CMAKE_COMMAND}" "-DSPIRV_DIS_EXE=spirv-dis.exe" "-DNSC_RELEASE_BUILD_INFO=$" @@ -241,37 +252,44 @@ execute_process(COMMAND "${CMAKE_COMMAND}" -P "${CMAKE_CURRENT_SOURCE_DIR}/ce-generate-config.cmake" COMMAND_ERROR_IS_FATAL ANY) -execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Updating NSC package context") +message(STATUS "Updating NSC package context") execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory_if_different "$" "${NBL_DOCKER_CTX_DIR}/Nabla" COMMAND_ERROR_IS_FATAL ANY) -execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Building NSC Godbolt image") -execute_process(COMMAND "${DOCKER_EXE}" build ${ISOLATION} +message(STATUS "Building NSC Godbolt image") +execute_process(COMMAND "${DOCKER_EXE}" build --isolation process -f "${DOCKERFILE}" -t ${NSC_IMAGE_NAME} "${NBL_DOCKER_CTX_DIR}" COMMAND_ERROR_IS_FATAL ANY) -execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Running new NSC orphan container") -execute_process(COMMAND "${DOCKER_EXE}" run -di -p 80:10240 ${ISOLATION} - --name "${ORPHAN}" ${NSC_IMAGE_NAME} +message(STATUS "Running new NSC orphan container") +execute_process(COMMAND "${DOCKER_EXE}" run -di -p 80:10240 --isolation process + --name "${ORPHAN}" + -v $ + -v $ + ${NSC_IMAGE_NAME} COMMAND_ERROR_IS_FATAL ANY) -execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Health‐check") +message(STATUS "Healthy check") execute_process(COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_HEALTHY_CHECK_PY}" --url "${NBL_CE_URL}" --interval 5 --ticks 12 
COMMAND_ERROR_IS_FATAL ANY) -execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Post‐Checking basic shader compile") +message(STATUS "Post Basic NSC shader compile check") execute_process(COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" --url "${NBL_CE_URL}" --endpoint /api/compiler/nsc_$>_upstream/compile --method POST --json "${NBL_NSC_BASIC_HLSL_JPAYLOAD}" COMMAND_ERROR_IS_FATAL ANY) -execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "OK! NSC container is healthy.") +message(STATUS "Printing NSC container logs") +execute_process(COMMAND "${DOCKER_EXE}" logs "${ORPHAN}" COMMAND_ERROR_IS_FATAL ANY) + +message(STATUS "OK! NSC container is healthy.") +message(STATUS "Type \"localhost\" in your browser to use NSC with Godbolt!") ]=] INSTRUCTIONS) file(GENERATE OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/run-compiler-explorer-$.cmake" CONTENT "${INSTRUCTIONS}") From c6d23bd2adbf9e1d9dfef08213cb16f44581e364 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Tue, 27 May 2025 13:30:32 +0200 Subject: [PATCH 224/346] mount named pipeline and use as docker host, update .github/workflows/build-nabla.yml --- .github/workflows/build-nabla.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index de1194d34b..94263a89e8 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -68,6 +68,7 @@ jobs: --env-file .\docker\ninja.env ` --name orphan ` -v "${{ github.workspace }}:${{ env.mount }}" ` + -v "${pipeHost}:\\.\pipe\dockerd" -e "DOCKER_HOST=npipe:////./pipe/dockerd" ` -w "${{ env.mount }}" ` "${{ env.image }}:${{ matrix.tag }}" ` ${{ env.cmd }} From 68095dac5c6095f6209e9a915553ac6a0dc424e5 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 27 May 2025 19:42:41 +0700 Subject: [PATCH 225/346] Fix error in ILogicalDevice.cpp due to removed getShaders method --- src/nbl/video/ILogicalDevice.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 62e364a71a..d9e1479d2e 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -862,7 +862,7 @@ bool ILogicalDevice::createGraphicsPipelines( core::vector newParams(params.begin(), params.end()); const auto shaderCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) { - return sum + param.getShaders().size(); + return sum + param.getShaderCount(); }); core::vector> debloatedShaders; // vector to hold all the debloated shaders, so the pointer from the new ShaderSpecInfo is not dangling debloatedShaders.reserve(shaderCount); From 98e17598f15b50e8b82f1e2eeb02e88dec1d4a2f Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 27 May 2025 19:43:04 +0700 Subject: [PATCH 226/346] Fix all errors in CVulkanLogicalDevice --- src/nbl/video/CVulkanLogicalDevice.cpp | 140 ++++++++++++++----------- src/nbl/video/CVulkanLogicalDevice.h | 6 +- 2 files changed, 82 insertions(+), 64 deletions(-) diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 792ab719eb..6050b7a7a0 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -1035,7 +1035,9 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createFramebuffer_ // TODO: Change this to pass SPIR-V directly! 
VkPipelineShaderStageCreateInfo getVkShaderStageCreateInfoFrom( - const asset::IPipelineBase::SShaderSpecInfo& specInfo, + const video::IGPUPipelineBase::SShaderSpecInfo& specInfo, + hlsl::ShaderStage stage, + bool requireFullSubgroups, VkShaderModuleCreateInfo* &outShaderModule, std::string* &outEntryPoints, VkPipelineShaderStageRequiredSubgroupSizeCreateInfo* &outRequiredSubgroupSize, @@ -1054,8 +1056,6 @@ VkPipelineShaderStageCreateInfo getVkShaderStageCreateInfoFrom( // TODO: VkShaderModuleValidationCacheCreateInfoEXT from VK_EXT_validation_cache // TODO: VkPipelineRobustnessCreateInfoEXT from VK_EXT_pipeline_robustness (allows per-pipeline control of robustness) - const auto stage = specInfo.stage; - (*outEntryPoints) = specInfo.entryPoint; const auto entryPointName = outEntryPoints->c_str(); outEntryPoints++; @@ -1076,8 +1076,8 @@ VkPipelineShaderStageCreateInfo getVkShaderStageCreateInfoFrom( { outSpecMapEntry->constantID = entry.first; outSpecMapEntry->offset = std::distance(specDataBegin,outSpecData); - outSpecMapEntry->size = entry.second.size; - memcpy(outSpecData,entry.second.data,outSpecMapEntry->size); + outSpecMapEntry->size = entry.second.size(); + memcpy(outSpecData, entry.second.data(), outSpecMapEntry->size); outSpecData += outSpecMapEntry->size; outSpecMapEntry++; } @@ -1098,7 +1098,7 @@ VkPipelineShaderStageCreateInfo getVkShaderStageCreateInfoFrom( outShaderModule++; // Implicit: https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02754 - using subgroup_size_t = std::remove_reference_t::SUBGROUP_SIZE; + using subgroup_size_t = asset::IPipelineBase::SUBGROUP_SIZE; if (specInfo.requiredSubgroupSize>=subgroup_size_t::REQUIRE_4) { *ppNext = outRequiredSubgroupSize; @@ -1110,7 +1110,7 @@ VkPipelineShaderStageCreateInfo getVkShaderStageCreateInfoFrom( else retval.flags = 0; - if (specInfo.requireFullSubgroups) + if (requireFullSubgroups) { 
assert(stage==hlsl::ShaderStage::ESS_COMPUTE/*TODO: Or Mesh Or Task*/); retval.flags |= VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT; @@ -1141,7 +1141,7 @@ void CVulkanLogicalDevice::createComputePipelines_impl( IGPUPipelineCache* const pipelineCache, const std::span createInfos, core::smart_refctd_ptr* const output, - const IGPUComputePipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) { const VkPipelineCache vk_pipelineCache = pipelineCache ? static_cast(pipelineCache)->getInternalObject():VK_NULL_HANDLE; @@ -1168,7 +1168,7 @@ void CVulkanLogicalDevice::createComputePipelines_impl( { initPipelineCreateInfo(outCreateInfo,info); const auto& spec = info.shader; - outCreateInfo->stage = getVkShaderStageCreateInfoFrom(spec, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo, outSpecMapEntry, outSpecData); + outCreateInfo->stage = getVkShaderStageCreateInfoFrom(spec, hlsl::ShaderStage::ESS_COMPUTE, info.cached.requireFullSubgroups, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo, outSpecMapEntry, outSpecData); outCreateInfo++; } auto vk_pipelines = reinterpret_cast(output); @@ -1187,7 +1187,7 @@ void CVulkanLogicalDevice::createComputePipelines_impl( ); debugNameBuilder.str(""); const auto& specInfo = createInfos[i].shader; - debugNameBuilder << specInfo.shader->getFilepathHint() << "(" << specInfo.entryPoint << "," << specInfo.stage << ")\n"; + debugNameBuilder << specInfo.shader->getFilepathHint() << "(" << specInfo.entryPoint << "," << hlsl::ShaderStage::ESS_COMPUTE << ")\n"; } } else @@ -1198,7 +1198,7 @@ void CVulkanLogicalDevice::createGraphicsPipelines_impl( IGPUPipelineCache* const pipelineCache, const std::span createInfos, core::smart_refctd_ptr* const output, - const IGPUGraphicsPipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) { auto getVkStencilOpStateFrom 
= [](const asset::SStencilOpParams& params)->VkStencilOpState @@ -1300,14 +1300,20 @@ void CVulkanLogicalDevice::createGraphicsPipelines_impl( { initPipelineCreateInfo(outCreateInfo,info); outCreateInfo->pStages = outShaderStage; - for (const auto& spec : info.shaders) + auto processSpecShader = [&](IGPUPipelineBase::SShaderSpecInfo spec, hlsl::ShaderStage shaderStage) { if (spec.shader) { - *(outShaderStage++) = getVkShaderStageCreateInfoFrom(spec, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo, outSpecMapEntry, outSpecData); - outCreateInfo->stageCount = std::distancepStages)>(outCreateInfo->pStages, outShaderStage); + *(outShaderStage++) = getVkShaderStageCreateInfoFrom(spec, shaderStage, false, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo, outSpecMapEntry, outSpecData); + outCreateInfo->stageCount = std::distancepStages)>(outCreateInfo->pStages, outShaderStage); } - } + }; + processSpecShader(info.vertexShader, hlsl::ShaderStage::ESS_VERTEX); + processSpecShader(info.tesselationControlShader, hlsl::ShaderStage::ESS_TESSELLATION_CONTROL); + processSpecShader(info.tesselationEvaluationShader, hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION); + processSpecShader(info.geometryShader, hlsl::ShaderStage::ESS_GEOMETRY); + processSpecShader(info.fragmentShader, hlsl::ShaderStage::ESS_FRAGMENT); + // when dealing with mesh shaders, the vertex input and assembly state will be null { { @@ -1342,17 +1348,13 @@ void CVulkanLogicalDevice::createGraphicsPipelines_impl( } outCreateInfo->pInputAssemblyState = outInputAssembly++; } - for (const auto& spec : info.shaders) - if (spec.shader) + + if (info.tesselationControlShader.shader || info.tesselationEvaluationShader.shader) { - const auto stage = spec.stage; - if (stage==hlsl::ShaderStage::ESS_TESSELLATION_CONTROL || stage==hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION) - { - outTessellation->patchControlPoints = info.cached.primitiveAssembly.tessPatchVertCount; - 
outCreateInfo->pTessellationState = outTessellation++; - break; - } + outTessellation->patchControlPoints = info.cached.primitiveAssembly.tessPatchVertCount; + outCreateInfo->pTessellationState = outTessellation++; } + const auto& raster = info.cached.rasterization; { outViewport->viewportCount = raster.viewportCount; @@ -1432,16 +1434,22 @@ void CVulkanLogicalDevice::createGraphicsPipelines_impl( { for (size_t i=0ull; i(createInfos[i],vk_pipeline); debugNameBuilder.str(""); - for (const auto& shader: createInfos[i].shaders) + auto buildDebugName = [&](const IGPUPipelineBase::SShaderSpecInfo& spec, hlsl::ShaderStage stage) { - if (shader.shader != nullptr) - debugNameBuilder <getFilepathHint() << "(" << shader.entryPoint << "," << shader.stage << ")\n"; - } + if (spec.shader != nullptr) + debugNameBuilder <getFilepathHint() << "(" << spec.entryPoint << "," << stage << ")\n"; + }; + buildDebugName(createInfo.vertexShader, hlsl::ESS_VERTEX); + buildDebugName(createInfo.tesselationControlShader, hlsl::ESS_TESSELLATION_CONTROL); + buildDebugName(createInfo.tesselationEvaluationShader, hlsl::ESS_TESSELLATION_EVALUATION); + buildDebugName(createInfo.geometryShader, hlsl::ESS_GEOMETRY); + buildDebugName(createInfo.fragmentShader, hlsl::ESS_FRAGMENT); output[i]->setObjectDebugName(debugNameBuilder.str().c_str()); } } @@ -1453,12 +1461,11 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( IGPUPipelineCache* const pipelineCache, const std::span createInfos, core::smart_refctd_ptr* const output, - const IGPURayTracingPipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) { - using SShaderGroupParams = asset::IRayTracingPipelineBase::SShaderGroupsParams; - using SGeneralShaderGroup = asset::IRayTracingPipelineBase::SGeneralShaderGroup; - using SHitShaderGroup = asset::IRayTracingPipelineBase::SHitShaderGroup; + using SShaderGroupParams = 
IGPURayTracingPipeline::SCreationParams::SShaderGroupsParams; + using SHitShaderGroup = SShaderGroupParams::SHitGroup; const auto dynamicStates = std::array{ VK_DYNAMIC_STATE_RAY_TRACING_PIPELINE_STACK_SIZE_KHR }; const VkPipelineDynamicStateCreateInfo vk_dynamicStateCreateInfo = { @@ -1473,7 +1480,7 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( size_t maxShaderStages = 0; for (const auto& info : createInfos) - maxShaderStages += info.shaders.size(); + maxShaderStages += info.shaderGroups.getShaderCount(); size_t maxShaderGroups = 0; for (const auto& info : createInfos) maxShaderGroups += info.shaderGroups.getShaderGroupCount(); @@ -1498,40 +1505,51 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( auto outSpecInfo = vk_specializationInfos.data(); auto outSpecMapEntry = vk_specializationMapEntry.data(); auto outSpecData = specializationData.data(); - auto getVkShaderIndex = [](uint32_t index) { return index == SShaderGroupParams::SIndex::Unused ? VK_SHADER_UNUSED_KHR : index; }; - auto getGeneralVkRayTracingShaderGroupCreateInfo = [getVkShaderIndex](SGeneralShaderGroup group) -> VkRayTracingShaderGroupCreateInfoKHR + + for (const auto& info : createInfos) { - return { - .sType = VK_STRUCTURE_TYPE_RAY_TRACING_SHADER_GROUP_CREATE_INFO_KHR, - .pNext = nullptr, - .type = VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR, - .generalShader = getVkShaderIndex(group.index), - .closestHitShader = VK_SHADER_UNUSED_KHR, - .anyHitShader = VK_SHADER_UNUSED_KHR, - .intersectionShader = VK_SHADER_UNUSED_KHR, + core::unordered_map shaderIndexes; + auto getVkShaderIndex = [&](const asset::IShader* shader) + { return shader == nullptr ? 
VK_SHADER_UNUSED_KHR : shaderIndexes[shader]; }; + + auto getGeneralVkRayTracingShaderGroupCreateInfo = [getVkShaderIndex](IGPUPipelineBase::SShaderSpecInfo spec) -> VkRayTracingShaderGroupCreateInfoKHR + { + return { + .sType = VK_STRUCTURE_TYPE_RAY_TRACING_SHADER_GROUP_CREATE_INFO_KHR, + .pNext = nullptr, + .type = VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR, + .generalShader = getVkShaderIndex(spec.shader), + .closestHitShader = VK_SHADER_UNUSED_KHR, + .anyHitShader = VK_SHADER_UNUSED_KHR, + .intersectionShader = VK_SHADER_UNUSED_KHR, + }; }; - }; - auto getHitVkRayTracingShaderGroupCreateInfo = [getVkShaderIndex](SHitShaderGroup group) -> VkRayTracingShaderGroupCreateInfoKHR - { - return { - .sType = VK_STRUCTURE_TYPE_RAY_TRACING_SHADER_GROUP_CREATE_INFO_KHR, - .pNext = nullptr, - .type = group.intersection == SShaderGroupParams::SIndex::Unused ? - VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR : VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR, - .generalShader = VK_SHADER_UNUSED_KHR, - .closestHitShader = getVkShaderIndex(group.closestHit), - .anyHitShader = getVkShaderIndex(group.anyHit), - .intersectionShader = getVkShaderIndex(group.intersection), + auto getHitVkRayTracingShaderGroupCreateInfo = [getVkShaderIndex](SHitShaderGroup group) -> VkRayTracingShaderGroupCreateInfoKHR + { + return { + .sType = VK_STRUCTURE_TYPE_RAY_TRACING_SHADER_GROUP_CREATE_INFO_KHR, + .pNext = nullptr, + .type = group.intersection.shader == nullptr ? 
+ VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR : VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR, + .generalShader = VK_SHADER_UNUSED_KHR, + .closestHitShader = getVkShaderIndex(group.closestHit.shader), + .anyHitShader = getVkShaderIndex(group.anyHit.shader), + .intersectionShader = getVkShaderIndex(group.intersection.shader), + }; }; - }; - for (const auto& info : createInfos) - { + initPipelineCreateInfo(outCreateInfo,info); outCreateInfo->pStages = outShaderStage; - for (const auto& specInfo : info.shaders) + auto processSpecInfo = [&](const IGPUPipelineBase::SShaderSpecInfo& spec, hlsl::ShaderStage shaderStage) { - *(outShaderStage++) = getVkShaderStageCreateInfoFrom(specInfo, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo,outSpecMapEntry,outSpecData); - } + if (!spec.shader) return; + if (shaderIndexes.find(spec.shader) == shaderIndexes.end()) + { + shaderIndexes.insert({ spec.shader, static_cast(std::distance(outShaderStage, vk_shaderStage.data()))}); + *(outShaderStage++) = getVkShaderStageCreateInfoFrom(spec, shaderStage, false, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo,outSpecMapEntry,outSpecData); + } + }; + processSpecInfo(info.shaderGroups.raygen, hlsl::ESS_RAYGEN); outCreateInfo->stageCount = std::distancepStages)>(outCreateInfo->pStages,outShaderStage); assert(outCreateInfo->stageCount != 0); diff --git a/src/nbl/video/CVulkanLogicalDevice.h b/src/nbl/video/CVulkanLogicalDevice.h index 93d45dcc32..f5cda084c5 100644 --- a/src/nbl/video/CVulkanLogicalDevice.h +++ b/src/nbl/video/CVulkanLogicalDevice.h @@ -289,20 +289,20 @@ class CVulkanLogicalDevice final : public ILogicalDevice IGPUPipelineCache* const pipelineCache, const std::span createInfos, core::smart_refctd_ptr* const output, - const IGPUComputePipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) override; void createGraphicsPipelines_impl( 
IGPUPipelineCache* const pipelineCache, const std::span params, core::smart_refctd_ptr* const output, - const IGPUGraphicsPipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) override; void createRayTracingPipelines_impl( IGPUPipelineCache* const pipelineCache, const std::span params, core::smart_refctd_ptr* const output, - const IGPURayTracingPipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) override; // queries From 59ccb2240ac7e80ca752b5efc0cd254913e468f6 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 27 May 2025 19:43:19 +0700 Subject: [PATCH 227/346] Add get shader count for creationParams --- include/nbl/video/IGPUGraphicsPipeline.h | 11 +++++++++++ include/nbl/video/IGPURayTracingPipeline.h | 5 +++++ 2 files changed, 16 insertions(+) diff --git a/include/nbl/video/IGPUGraphicsPipeline.h b/include/nbl/video/IGPUGraphicsPipeline.h index c44ef5ceb1..806ee337c3 100644 --- a/include/nbl/video/IGPUGraphicsPipeline.h +++ b/include/nbl/video/IGPUGraphicsPipeline.h @@ -78,6 +78,17 @@ class IGPUGraphicsPipeline : public IGPUPipeline flags = FLAGS::NONE; + + inline uint32_t getShaderCount() const + { + uint32_t count = 0; + count += (vertexShader.shader != nullptr); + count += (tesselationControlShader.shader != nullptr); + count += (tesselationEvaluationShader.shader != nullptr); + count += (geometryShader.shader != nullptr); + count += (fragmentShader.shader != nullptr); + return count; + } }; inline core::bitflag getCreationFlags() const {return m_flags;} diff --git a/include/nbl/video/IGPURayTracingPipeline.h b/include/nbl/video/IGPURayTracingPipeline.h index beaecd772a..3bcd4537f3 100644 --- a/include/nbl/video/IGPURayTracingPipeline.h +++ b/include/nbl/video/IGPURayTracingPipeline.h @@ -66,6 +66,11 @@ class IGPURayTracingPipeline : public IGPUPipeline Date: Tue, 27 May 2025 14:50:09 +0200 Subject: [PATCH 228/346] 
update validation of kernel version & promote to process logic --- tools/nsc/CMakeLists.txt | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/tools/nsc/CMakeLists.txt b/tools/nsc/CMakeLists.txt index 0fad4987be..11b78ab4a3 100644 --- a/tools/nsc/CMakeLists.txt +++ b/tools/nsc/CMakeLists.txt @@ -157,29 +157,34 @@ set(NBL_DOCKER_NSC_COMPILER_CONFIG_OUTPUT "${NBL_DOCKER_CTX_DIR}/hlsl.local.prop string(GENEX_STRIP "${NBL_PACKAGE_RUNTIME_EXE_DIR_PATH}" NBL_RELATIVE_ENTRY) set(OUTPUT_CONFIG_FILE $) -function(PROMOTE_PROCESS_ISOLATION HOST_KERNEL BASE VAR) +function(PROMOTE_PROCESS_ISOLATION BASE VAR) set(${VAR} True) macro(INSPECT IMAGE) - execute_process(COMMAND "${DOCKER_EXE}" inspect --format={{.OsVersion}} ${IMAGE} - RESULT_VARIABLE EXIT_LEVEL + execute_process(COMMAND "${DOCKER_EXE}" inspect --format={{.OsVersion}} ${IMAGE} + RESULT_VARIABLE INSPECTION_OK OUTPUT_VARIABLE TARGET_KERNEL OUTPUT_STRIP_TRAILING_WHITESPACE ) endmacro() macro(TO_PROCESS IMAGE TARGET_KERNEL) - if(${HOST_KERNEL} VERSION_LESS ${TARGET_KERNEL}) - set(${VAR} False) - message(STATUS "Host kernel \"${HOST_KERNEL}\" version too low to promote process isolation with \"${IMAGE}\" [${TARGET_KERNEL}] and requires falling back to HyperV. Please update your host OS.") + execute_process(COMMAND "${DOCKER_EXE}" run --rm --isolation process --entrypoint cmd ${BASE} /K + RESULT_VARIABLE PROCESS_ISOLATION_OK + OUTPUT_QUIET ERROR_QUIET + ) + + if(${PROCESS_ISOLATION_OK} EQUAL 0) + message(STATUS "Promoting \"${IMAGE}\" [${TARGET_KERNEL}] to process isolation") else() - message(STATUS "\"${IMAGE}\" [${TARGET_KERNEL}] can be promoted to process isolation with host kernel [${HOST_KERNEL}] version") + set(${VAR} False) + message(STATUS "Cannot promote \"${IMAGE}\" [${TARGET_KERNEL}] to process isolation, requires falling back to HyperV. 
Please update your docker host OS.") endif() endmacro() INSPECT(${BASE}) - if(${EXIT_LEVEL} EQUAL 0) + if(${INSPECTION_OK} EQUAL 0) TO_PROCESS(${BASE} ${TARGET_KERNEL}) else() message(STATUS "\"${BASE}\" not found in local registry, pulling...") @@ -192,9 +197,7 @@ function(PROMOTE_PROCESS_ISOLATION HOST_KERNEL BASE VAR) set(${VAR} ${${VAR}} PARENT_SCOPE) endfunction() -execute_process(COMMAND cmd /C ver OUTPUT_VARIABLE PIPE OUTPUT_STRIP_TRAILING_WHITESPACE) -string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+" HOST_KERNEL "${PIPE}") -PROMOTE_PROCESS_ISOLATION(${HOST_KERNEL} ${BASE_IMAGE} USE_PROCESS_ISOLATION) +PROMOTE_PROCESS_ISOLATION(${BASE_IMAGE} USE_PROCESS_ISOLATION) if(NOT USE_PROCESS_ISOLATION) # NOTE: we would need to use GET_RUNTIME_DEPENDENCIES which uses objdump From bc9befbced6d8489ddc09b84b97ba63207a86985 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 27 May 2025 20:05:53 +0700 Subject: [PATCH 229/346] Move shader stage validation out of commonCreatePipelines --- include/nbl/video/ILogicalDevice.h | 69 +----------------------------- src/nbl/video/ILogicalDevice.cpp | 59 ++++++++++++++++--------- 2 files changed, 41 insertions(+), 87 deletions(-) diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index ab0d5bea06..0ad882a71e 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -1096,8 +1096,8 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe virtual core::smart_refctd_ptr createRenderpass_impl(const IGPURenderpass::SCreationParams& params, IGPURenderpass::SCreationParamValidationResult&& validation) = 0; virtual core::smart_refctd_ptr createFramebuffer_impl(IGPUFramebuffer::SCreationParams&& params) = 0; - template - inline SSpecializationValidationResult commonCreatePipelines(IGPUPipelineCache* const pipelineCache, const std::span params, ExtraLambda&& extra) + template + inline SSpecializationValidationResult 
commonCreatePipelines(IGPUPipelineCache* const pipelineCache, const std::span params) { if (pipelineCache && !pipelineCache->wasCreatedBy(this)) { @@ -1149,71 +1149,6 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe return {}; } - const auto& features = getEnabledFeatures(); - for (auto info : ci.getShaders()) - if (info.shader) - { - const asset::IShader::E_SHADER_STAGE shaderStage = info.stage; - - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-00704 - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-00705 - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-02091 - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-02092 - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-00706 - switch (shaderStage) - { - case hlsl::ShaderStage::ESS_TESSELLATION_CONTROL: [[fallthrough]]; - case hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION: - if (!features.tessellationShader) - { - NBL_LOG_ERROR("Cannot create IGPUShader for %p, Tessellation Shader feature not enabled!", info.shader); - return {}; - } - break; - case hlsl::ShaderStage::ESS_GEOMETRY: - if (!features.geometryShader) - { - NBL_LOG_ERROR("Cannot create IGPUShader for %p, Geometry Shader feature not enabled!", info.shader); - return {}; - } - break; - case hlsl::ShaderStage::ESS_ALL_OR_LIBRARY: [[fallthrough]]; - case hlsl::ShaderStage::ESS_VERTEX: [[fallthrough]]; - case hlsl::ShaderStage::ESS_FRAGMENT: [[fallthrough]]; - case hlsl::ShaderStage::ESS_COMPUTE: - break; - // unsupported yet - 
case hlsl::ShaderStage::ESS_TASK: [[fallthrough]]; - case hlsl::ShaderStage::ESS_MESH: - NBL_LOG_ERROR("Unsupported (yet) shader stage"); - return {}; - break; - case hlsl::ShaderStage::ESS_RAYGEN: [[fallthrough]]; - case hlsl::ShaderStage::ESS_ANY_HIT: [[fallthrough]]; - case hlsl::ShaderStage::ESS_CLOSEST_HIT: [[fallthrough]]; - case hlsl::ShaderStage::ESS_MISS: [[fallthrough]]; - case hlsl::ShaderStage::ESS_INTERSECTION: [[fallthrough]]; - case hlsl::ShaderStage::ESS_CALLABLE: - if (!features.rayTracingPipeline) - { - NBL_LOG_ERROR("Cannot create IGPUShader for %p, Raytracing Pipeline feature not enabled!", info.shader); - return {}; - } - break; - default: - // Implicit unsupported stages or weird multi-bit stage enum values - NBL_LOG_ERROR("Unknown Shader Stage %d", shaderStage); - return {}; - break; - } - - if (!extra(info)) - { - NBL_LOG_ERROR("Invalid shader were specified (params[%d])", i); - return {}; - } - } - retval += validation; } return retval; diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index d9e1479d2e..c019be84a7 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -788,16 +788,8 @@ asset::ICPUPipelineCache::SCacheKey ILogicalDevice::getPipelineCacheKey() const bool ILogicalDevice::createComputePipelines(IGPUPipelineCache* const pipelineCache, const std::span params, core::smart_refctd_ptr* const output) { std::fill_n(output,params.size(),nullptr); - SSpecializationValidationResult specConstantValidation = commonCreatePipelines(pipelineCache,params,[this](const IGPUPipelineBase::SShaderSpecInfo& info)->bool - { - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02755 - if (info.requiredSubgroupSize>=asset::IPipelineBase::SUBGROUP_SIZE::REQUIRE_4 && !getPhysicalDeviceLimits().requiredSubgroupSizeStages.hasFlags(hlsl::ShaderStage::ESS_COMPUTE)) - { - NBL_LOG_ERROR("Invalid 
shader stage"); - return false; - } - return true; - }); + SSpecializationValidationResult specConstantValidation = commonCreatePipelines(pipelineCache, params); + if (!specConstantValidation) { NBL_LOG_ERROR("Invalid parameters were given"); @@ -815,6 +807,14 @@ bool ILogicalDevice::createComputePipelines(IGPUPipelineCache* const pipelineCac for (auto ix = 0u; ix < params.size(); ix++) { const auto& ci = params[ix]; + + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02755 + if (ci.shader.requiredSubgroupSize>=asset::IPipelineBase::SUBGROUP_SIZE::REQUIRE_4 && !getPhysicalDeviceLimits().requiredSubgroupSizeStages.hasFlags(hlsl::ShaderStage::ESS_COMPUTE)) + { + NBL_LOG_ERROR("Invalid shader stage"); + return false; + } + const core::set entryPoints = { asset::ISPIRVDebloater::EntryPoint{.name = ci.shader.entryPoint, .stage = hlsl::ShaderStage::ESS_COMPUTE} }; debloatedShaders.push_back(m_spirvDebloater->debloat(ci.shader.shader, entryPoints, m_logger)); auto debloatedShaderSpec = ci.shader; @@ -845,12 +845,7 @@ bool ILogicalDevice::createGraphicsPipelines( ) { std::fill_n(output, params.size(), nullptr); - SSpecializationValidationResult specConstantValidation = commonCreatePipelines(nullptr, params, - [this](const IGPUPipelineBase::SShaderSpecInfo& info)->bool - { - return info.shader != nullptr; - } - ); + SSpecializationValidationResult specConstantValidation = commonCreatePipelines(nullptr, params); if (!specConstantValidation) { NBL_LOG_ERROR("Invalid parameters were given"); @@ -870,6 +865,27 @@ bool ILogicalDevice::createGraphicsPipelines( for (auto ix = 0u; ix < params.size(); ix++) { const auto& ci = params[ix]; + + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-00704 + // 
https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-00705 + if (ci.tesselationControlShader.shader) + { + NBL_LOG_ERROR("Cannot create IGPUShader for %p, Tessellation Shader feature not enabled!", ci.tesselationControlShader.shader); + return false; + } + + if (ci.tesselationEvaluationShader.shader) + { + NBL_LOG_ERROR("Cannot create IGPUShader for %p, Tessellation Shader feature not enabled!", ci.tesselationEvaluationShader.shader); + return false; + } + + if (ci.geometryShader.shader) + { + NBL_LOG_ERROR("Cannot create IGPUShader for %p, Geometry Shader feature not enabled!", ci.geometryShader.shader); + return false; + } + auto renderpass = ci.renderpass; if (!renderpass->wasCreatedBy(this)) { @@ -996,10 +1012,7 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline core::smart_refctd_ptr* const output) { std::fill_n(output,params.size(),nullptr); - SSpecializationValidationResult specConstantValidation = commonCreatePipelines(pipelineCache,params,[this](const IGPUPipelineBase::SShaderSpecInfo& info)->bool - { - return true; - }); + SSpecializationValidationResult specConstantValidation = commonCreatePipelines(pipelineCache,params); if (!specConstantValidation) { NBL_LOG_ERROR("Invalid parameters were given"); @@ -1020,6 +1033,12 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline const bool skipAABBs = bool(param.flags & IGPURayTracingPipeline::SCreationParams::FLAGS::SKIP_AABBS); const bool skipBuiltin = bool(param.flags & IGPURayTracingPipeline::SCreationParams::FLAGS::SKIP_BUILT_IN_PRIMITIVES); + if (!features.rayTracingPipeline) + { + NBL_LOG_ERROR("Raytracing Pipeline feature not enabled!"); + return {}; + } + // https://docs.vulkan.org/spec/latest/chapters/pipelines.html#VUID-VkRayTracingPipelineCreateInfoKHR-rayTraversalPrimitiveCulling-03597 if (skipAABBs && skipBuiltin) { From 
3f5708e5f6abd295d8be64f4f9135dcab80f1741 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Tue, 27 May 2025 17:11:43 +0200 Subject: [PATCH 230/346] let override publish CE port & control its URL depending on NBL_DOCKER_DIND_BUILD, update CMakePresets.json --- CMakePresets.json | 4 +++- tools/nsc/CMakeLists.txt | 18 ++++++++++++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/CMakePresets.json b/CMakePresets.json index 359ec6fb02..ae56cf1739 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -19,7 +19,9 @@ "NBL_EXPLICIT_MODULE_LOAD_LOG": "ON", "NBL_CPACK_NO_BUILD_DIRECTORY_MODULES": "ON", "NBL_CPACK_CI": "ON", - "GIT_FAIL_IF_NONZERO_EXIT": "OFF" + "GIT_FAIL_IF_NONZERO_EXIT": "OFF", + "NBL_DOCKER_DIND_BUILD": "ON", + "NBL_CE_PUBLISH_PORT": "10240" } }, { diff --git a/tools/nsc/CMakeLists.txt b/tools/nsc/CMakeLists.txt index 11b78ab4a3..55db4ce14a 100644 --- a/tools/nsc/CMakeLists.txt +++ b/tools/nsc/CMakeLists.txt @@ -222,7 +222,17 @@ if(NOT EXISTS ${ICU_DIR} OR NOT EXISTS ${ICU_DLL}) endif() set(ORPHAN nsc-orphan) -set(NBL_CE_URL http://${ORPHAN}:10240) + +if(NOT DEFINED NBL_CE_PUBLISH_PORT) + set(NBL_CE_PUBLISH_PORT 80) +endif() + +if(NBL_DOCKER_DIND_BUILD) + set(NBL_CE_URL http://${ORPHAN}:${NBL_CE_PUBLISH_PORT}) +else() + set(NBL_CE_URL http://localhost:${NBL_CE_PUBLISH_PORT}) +endif() + set(NBL_CE_HEALTHY_CHECK_PY "${NBL_ROOT_PATH}/docker/compiler-explorer/ce_healthy_check.py") set(NBL_CE_ENDPOINT_PY "${NBL_ROOT_PATH}/docker/compiler-explorer/endpoint.py") set(NBL_NSC_BASIC_HLSL_JPAYLOAD "${CMAKE_CURRENT_SOURCE_DIR}/docker/godbolt/hlsl-basic-compile-payload.json") @@ -269,8 +279,8 @@ execute_process(COMMAND "${DOCKER_EXE}" build --isolation process COMMAND_ERROR_IS_FATAL ANY) message(STATUS "Running new NSC orphan container") -execute_process(COMMAND "${DOCKER_EXE}" run -di -p 80:10240 --isolation process - --name "${ORPHAN}" +execute_process(COMMAND "${DOCKER_EXE}" run -di -p ${NBL_CE_PUBLISH_PORT}:10240 --isolation process + 
--name "${ORPHAN}" --network docker_default -v $ -v $ ${NSC_IMAGE_NAME} @@ -292,7 +302,7 @@ message(STATUS "Printing NSC container logs") execute_process(COMMAND "${DOCKER_EXE}" logs "${ORPHAN}" COMMAND_ERROR_IS_FATAL ANY) message(STATUS "OK! NSC container is healthy.") -message(STATUS "Type \"localhost\" in your browser to use NSC with Godbolt!") +message(STATUS "Type \"${NBL_CE_URL}\" in your browser to use NSC with Godbolt!") ]=] INSTRUCTIONS) file(GENERATE OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/run-compiler-explorer-$.cmake" CONTENT "${INSTRUCTIONS}") From b81fb12b95561e9c6822b20ddac6d02d7fc4ee23 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Tue, 27 May 2025 17:16:48 +0200 Subject: [PATCH 231/346] and specify network for builder, update .github/workflows/build-nabla.yml --- .github/workflows/build-nabla.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 94263a89e8..30f1156096 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -66,7 +66,7 @@ jobs: --entrypoint ${{ env.entry }} -di --isolation process ` --env-file .\docker\ci-windows.env ` --env-file .\docker\ninja.env ` - --name orphan ` + --name orphan --network docker_default ` -v "${{ github.workspace }}:${{ env.mount }}" ` -v "${pipeHost}:\\.\pipe\dockerd" -e "DOCKER_HOST=npipe:////./pipe/dockerd" ` -w "${{ env.mount }}" ` From fcbfa5c56380414582ae381c5cad3f04028f34dc Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Tue, 27 May 2025 17:31:18 +0200 Subject: [PATCH 232/346] add "create default network" step to actions --- .github/workflows/build-nabla.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 30f1156096..d079b8bcaf 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -55,6 +55,13 @@ jobs: run: | docker pull "${{ env.image }}:${{ 
matrix.tag }}" + - name: Create default network + run: | + if (-not (docker network ls --format '{{.Name}}' | Where-Object { $_ -eq 'docker_default' })) { + docker network create --driver nat docker_default + if ($LASTEXITCODE -ne 0) { exit 1 } + } + - name: Run Container run: | $ctx = docker context show From 472aa0ba6f98bed8a8d3996bececb514e1473046 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 28 May 2025 10:50:00 +0700 Subject: [PATCH 233/346] more fixes to indexing --- .../hlsl/workgroup2/arithmetic_config.hlsl | 15 +++++++++++++-- .../nbl/builtin/hlsl/workgroup2/shared_scan.hlsl | 2 +- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index e02c74e80b..1587f919cc 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -46,6 +46,11 @@ struct ArithmeticConfiguration using virtual_wg_t = impl::virtual_wg_size_log2; NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = virtual_wg_t::levels; NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualWorkgroupSize = uint16_t(0x1u) << virtual_wg_t::value; + static_assert(VirtualWorkgropupSize<=WorkgroupSize*SubgroupSize) + + NBL_CONSTEXPR_STATIC_INLINE uint16_t __SubgroupsPerVirtualWorkgroupLog2 = mpl::max_v; + NBL_CONSTEXPR_STATIC_INLINE uint16_t __SubgroupsPerVirtualWorkgroup = uint16_t(0x1u) << __SubgroupsPerVirtualWorkgroupLog2; + using items_per_invoc_t = impl::items_per_invocation; // NBL_CONSTEXPR_STATIC_INLINE uint32_t2 ItemsPerInvocation; TODO? 
doesn't allow inline definitions for uint32_t2 for some reason, uint32_t[2] as well ; declaring out of line results in not constant expression NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_0 = items_per_invoc_t::value0; @@ -74,10 +79,16 @@ struct ArithmeticConfiguration template static uint32_t sharedStoreIndex(const uint32_t subgroupID) { + uint32_t offsetBySubgroup; + if (level == LevelCount-1) + offsetBySubgroup = SubgroupSize; + else + offsetBySubgroup = __SubgroupsPerVirtualWorkgroup; + if (level<2) - return (subgroupID & (ItemsPerInvocation_1-1)) * SubgroupSize + (subgroupID/ItemsPerInvocation_1); + return (subgroupID & (ItemsPerInvocation_1-1)) * offsetBySubgroup + (subgroupID/ItemsPerInvocation_1); else - return (subgroupID & (ItemsPerInvocation_2-1)) * SubgroupSize + (subgroupID/ItemsPerInvocation_2); + return (subgroupID & (ItemsPerInvocation_2-1)) * offsetBySubgroup + (subgroupID/ItemsPerInvocation_2); } template diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 418c3219f4..99238851eb 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -321,7 +321,7 @@ struct scan // level 1 scan const uint32_t lv1_smem_size = Config::SubgroupsSize*Config::ItemsPerInvocation_1; subgroup2::inclusive_scan inclusiveScan1; - if (glsl::gl_SubgroupID() < lv1_smem_size) + if (glsl::gl_SubgroupID() < Config::SubgroupsSize*Config::ItemsPerInvocation_2) { vector_lv1_t lv1_val; const uint32_t prevIndex = invocationIndex-1; From c483941b09f804fada57491a4f69ffdb27518df2 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 28 May 2025 11:38:18 +0700 Subject: [PATCH 234/346] share level 0 scan between 2-level and 3-level scans (and reduce) --- .../hlsl/workgroup2/arithmetic_config.hlsl | 2 +- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 93 ++++++++----------- 2 files changed, 40 insertions(+), 55 deletions(-) diff --git 
a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 1587f919cc..75947ea97c 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -46,7 +46,7 @@ struct ArithmeticConfiguration using virtual_wg_t = impl::virtual_wg_size_log2; NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = virtual_wg_t::levels; NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualWorkgroupSize = uint16_t(0x1u) << virtual_wg_t::value; - static_assert(VirtualWorkgropupSize<=WorkgroupSize*SubgroupSize) + static_assert(VirtualWorkgroupSize<=WorkgroupSize*SubgroupSize); NBL_CONSTEXPR_STATIC_INLINE uint16_t __SubgroupsPerVirtualWorkgroupLog2 = mpl::max_v; NBL_CONSTEXPR_STATIC_INLINE uint16_t __SubgroupsPerVirtualWorkgroup = uint16_t(0x1u) << __SubgroupsPerVirtualWorkgroupLog2; diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 99238851eb..195431c5d3 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -85,22 +85,17 @@ struct reduce using vector_lv0_t = vector; // data accessor needs to be this type using vector_lv1_t = vector; - template - scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + template + static void __doLevel0(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { - using config_t = subgroup2::Configuration; - using params_lv0_t = subgroup2::ArithmeticParams; - using params_lv1_t = subgroup2::ArithmeticParams; - BinOp binop; - const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 0 scan - subgroup2::reduction reduction0; + subgroup2::reduction reduction0; [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / 
Config::WorkgroupSize; idx++) { - vector_lv0_t scan_local; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); + vector_t scan_local; + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); scan_local = reduction0(scan_local); if (Config::electLast()) { @@ -109,7 +104,19 @@ struct reduce } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); + } + + template + scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_lv0_t = subgroup2::ArithmeticParams; + using params_lv1_t = subgroup2::ArithmeticParams; + BinOp binop; + __doLevel0(dataAccessor, scratchAccessor); + + const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan subgroup2::reduction reduction1; if (glsl::gl_SubgroupID() == 0) @@ -138,24 +145,19 @@ struct scan using vector_lv0_t = vector; // data accessor needs to be this type using vector_lv1_t = vector; - template - void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + template + static void __doLevel0(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { - using config_t = subgroup2::Configuration; - using params_lv0_t = subgroup2::ArithmeticParams; - using params_lv1_t = subgroup2::ArithmeticParams; - BinOp binop; - const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); - subgroup2::inclusive_scan inclusiveScan0; + subgroup2::inclusive_scan inclusiveScan0; // level 0 scan [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - vector_lv0_t value; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + vector_t value; + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); value = 
inclusiveScan0(value); - dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); if (Config::electLast()) { const uint32_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); @@ -163,7 +165,19 @@ struct scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); + } + template + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_lv0_t = subgroup2::ArithmeticParams; + using params_lv1_t = subgroup2::ArithmeticParams; + BinOp binop; + + __doLevel0(dataAccessor, scratchAccessor); + + const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan subgroup2::inclusive_scan inclusiveScan1; if (glsl::gl_SubgroupID() == 0) @@ -228,23 +242,9 @@ struct reduce using params_lv2_t = subgroup2::ArithmeticParams; BinOp binop; - const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); - // level 0 scan - subgroup2::reduction reduction0; - [unroll] - for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) - { - vector_lv0_t scan_local; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); - scan_local = reduction0(scan_local); - if (Config::electLast()) - { - const uint32_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); - scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan - } - } - scratchAccessor.workgroupExecutionAndMemoryBarrier(); + reduce::template __doLevel0(dataAccessor, scratchAccessor); + const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan const uint32_t 
lv1_smem_size = Config::SubgroupsSize*Config::ItemsPerInvocation_1; subgroup2::reduction reduction1; @@ -300,24 +300,9 @@ struct scan using params_lv2_t = subgroup2::ArithmeticParams; BinOp binop; - const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); - subgroup2::inclusive_scan inclusiveScan0; - // level 0 scan - [unroll] - for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) - { - vector_lv0_t value; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - value = inclusiveScan0(value); - dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - if (Config::electLast()) - { - const uint32_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); - scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan - } - } - scratchAccessor.workgroupExecutionAndMemoryBarrier(); + scan::template __doLevel0(dataAccessor, scratchAccessor); + const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan const uint32_t lv1_smem_size = Config::SubgroupsSize*Config::ItemsPerInvocation_1; subgroup2::inclusive_scan inclusiveScan1; From 951ff99bc2ab1be385010c06ca3ba8ad236f2b2c Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 28 May 2025 12:11:14 +0700 Subject: [PATCH 235/346] reduce duplicate vars in config --- .../builtin/hlsl/workgroup2/arithmetic_config.hlsl | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 75947ea97c..c0e105e700 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -15,20 +15,23 @@ namespace workgroup2 namespace 
impl { -template +template struct virtual_wg_size_log2 { + NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSizeLog2 = _WorkgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = _SubgroupSizeLog2; static_assert(WorkgroupSizeLog2>=SubgroupSizeLog2, "WorkgroupSize cannot be smaller than SubgroupSize"); static_assert(WorkgroupSizeLog2<=SubgroupSizeLog2*3+4, "WorkgroupSize cannot be larger than (SubgroupSize^3)*16"); + NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2),uint16_t,conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2*2+2),uint16_t,3,2>::value,1>::value; NBL_CONSTEXPR_STATIC_INLINE uint16_t value = mpl::max_v; // must have at least enough level 0 outputs to feed a single subgroup }; -template +template struct items_per_invocation { - NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocationProductLog2 = mpl::max_v; + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocationProductLog2 = mpl::max_v; NBL_CONSTEXPR_STATIC_INLINE uint16_t value0 = BaseItemsPerInvocation; NBL_CONSTEXPR_STATIC_INLINE uint16_t value1 = uint16_t(0x1u) << conditional_value, ItemsPerInvocationProductLog2>::value; NBL_CONSTEXPR_STATIC_INLINE uint16_t value2 = uint16_t(0x1u) << mpl::max_v; @@ -51,7 +54,7 @@ struct ArithmeticConfiguration NBL_CONSTEXPR_STATIC_INLINE uint16_t __SubgroupsPerVirtualWorkgroupLog2 = mpl::max_v; NBL_CONSTEXPR_STATIC_INLINE uint16_t __SubgroupsPerVirtualWorkgroup = uint16_t(0x1u) << __SubgroupsPerVirtualWorkgroupLog2; - using items_per_invoc_t = impl::items_per_invocation; + using items_per_invoc_t = impl::items_per_invocation; // NBL_CONSTEXPR_STATIC_INLINE uint32_t2 ItemsPerInvocation; TODO? 
doesn't allow inline definitions for uint32_t2 for some reason, uint32_t[2] as well ; declaring out of line results in not constant expression NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_0 = items_per_invoc_t::value0; NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_1 = items_per_invoc_t::value1; From 1f64763acb7cb41c8cde1d5a65ca9316d3da34cb Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 28 May 2025 10:06:25 +0200 Subject: [PATCH 236/346] add labeling to image creation steps, update actions to upload image workflow artifacts --- .github/workflows/build-nabla.yml | 34 +++-- tools/nsc/CMakeLists.txt | 241 +++++++++++++++++------------- 2 files changed, 165 insertions(+), 110 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index d079b8bcaf..e15d1a5ab2 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -39,6 +39,11 @@ jobs: Set-MpPreference -DisableRemovableDriveScanning $true Set-MpPreference -DisableArchiveScanning $true Set-MpPreference -DisableScanningMappedNetworkDrivesForFullScan $true + + if (-not (docker network ls --format '{{.Name}}' | Where-Object { $_ -eq 'docker_default' })) { + docker network create --driver nat docker_default + if ($LASTEXITCODE -ne 0) { exit 1 } + } - name: Checkout uses: actions/checkout@v4 @@ -47,21 +52,17 @@ jobs: - name: Set prefix id: set-prefix - shell: bash run: | - echo "prefix=run-windows-${{ matrix.tag }}-${{ matrix.vendor }}-${{ matrix.config }}" >> "$GITHUB_OUTPUT" + $prefix = "run-windows-${{ matrix.tag }}-${{ matrix.vendor }}-${{ matrix.config }}" + $nscTargetTaggedImage = "ghcr.io/$env:GITHUB_REPOSITORY:nsc-godbolt-build-${{ matrix.vendor }}-${{ matrix.config }}-${{ matrix.tag }}".ToLower() + + "prefix=$prefix" >> $env:GITHUB_OUTPUT + "nscTargetTaggedImage=$nscTargetTaggedImage" >> $env:GITHUB_OUTPUT - name: Pull Image run: | docker pull "${{ env.image }}:${{ matrix.tag }}" - - name: Create default 
network - run: | - if (-not (docker network ls --format '{{.Name}}' | Where-Object { $_ -eq 'docker_default' })) { - docker network create --driver nat docker_default - if ($LASTEXITCODE -ne 0) { exit 1 } - } - - name: Run Container run: | $ctx = docker context show @@ -97,6 +98,7 @@ jobs: --preset ci-configure-dynamic-${{ matrix.vendor }} ` --profiling-output=profiling/cmake-profiling.json ` --profiling-format=google-trace + -DNSC_IMAGE_NAME=${{ steps.set-prefix.outputs.nscTargetTaggedImage }} - name: Container – Build NSC run: | @@ -116,11 +118,23 @@ jobs: ${{ env.binary }} --config ${{ matrix.config }} ` --component Executables --prefix ${{ env.install }} - - name: Package workflow artifacts + - name: Container – Save NSC Image + run: | + docker exec orphan ` + ${{ env.entry }} ${{ env.cmd }} -Command docker ` + save ${{ steps.set-prefix.outputs.nscTargetTaggedImage }} | zstd -T0 -3 -f -o ${{ steps.set-prefix.outputs.prefix }}-nsc-godbolt-image.tar.zst + + - name: Package left workflow artifacts run: | tar -cvf "${{ steps.set-prefix.outputs.prefix }}-profiling.tar" profiling tar -cvf "${{ steps.set-prefix.outputs.prefix }}-install.tar" ${{ env.install }} + - name: Upload NSC Godbolt Image artifact + uses: actions/upload-artifact@v4 + with: + name: ${{ steps.set-prefix.outputs.prefix }}-nsc-godbolt-image + path: ${{ steps.set-prefix.outputs.prefix }}-nsc-godbolt-image.tar.zst + - name: Upload profiling artifacts uses: actions/upload-artifact@v4 with: diff --git a/tools/nsc/CMakeLists.txt b/tools/nsc/CMakeLists.txt index 55db4ce14a..d3b8bdf94a 100644 --- a/tools/nsc/CMakeLists.txt +++ b/tools/nsc/CMakeLists.txt @@ -59,11 +59,82 @@ add_test(NAME NBL_NSC_DUMP_BUILD_INFO_TEST if(NBL_ENABLE_DOCKER_INTEGRATION) +find_program(DOCKER_EXE NAMES docker REQUIRED) set(BASE_IMAGE ghcr.io/devsh-graphics-programming/compiler-explorer-docker:nano-2022) +set(CORE_IMAGE mcr.microsoft.com/windows/servercore:ltsc2022) -find_program(CTEST_EXE NAMES ctest REQUIRED) 
-find_program(DOCKER_EXE NAMES docker REQUIRED) +function(PROMOTE_PROCESS_ISOLATION BASE VAR) + set(${VAR} True) + + macro(INSPECT IMAGE) + execute_process(COMMAND "${DOCKER_EXE}" inspect --format={{.OsVersion}} ${IMAGE} + RESULT_VARIABLE INSPECTION_OK + OUTPUT_VARIABLE TARGET_KERNEL + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + endmacro() + + macro(TO_PROCESS IMAGE TARGET_KERNEL) + execute_process(COMMAND "${DOCKER_EXE}" run --rm --isolation process --entrypoint cmd ${BASE} /K + RESULT_VARIABLE PROCESS_ISOLATION_OK + OUTPUT_QUIET ERROR_QUIET + ) + + if(${PROCESS_ISOLATION_OK} EQUAL 0) + message(STATUS "Promoting \"${IMAGE}\" [${TARGET_KERNEL}] to process isolation") + else() + set(${VAR} False) + message(STATUS "Cannot promote \"${IMAGE}\" [${TARGET_KERNEL}] to process isolation, requires falling back to HyperV. Please update your docker host OS.") + endif() + endmacro() + + INSPECT(${BASE}) + + if(${INSPECTION_OK} EQUAL 0) + TO_PROCESS(${BASE} ${TARGET_KERNEL}) + else() + message(STATUS "\"${BASE}\" not found in local registry, pulling...") + execute_process(COMMAND "${DOCKER_EXE}" pull ${BASE}) + + INSPECT(${BASE}) + TO_PROCESS(${BASE} ${TARGET_KERNEL}) + endif() + + set(${VAR} ${${VAR}} PARENT_SCOPE) +endfunction() + +PROMOTE_PROCESS_ISOLATION(${BASE_IMAGE} USE_PROCESS_ISOLATION) + +if(NOT USE_PROCESS_ISOLATION) + # NOTE: we would need to use GET_RUNTIME_DEPENDENCIES which uses objdump + # https://cmake.org/cmake/help/latest/command/file.html#get-runtime-dependencies + # to collect *all* missing deps and copy (FROM at least server core) to destination nano + # image, it will fail currently if we fully isolate it with VM due to lack of certain DLLs + # BUT it means violating EULA, hence we are not going to support it, also (**) + message(FATAL_ERROR "HyperV is NOT supported! 
Update your OS!") +endif() + +function(GET_LABEL BASE_IMAGE LABEL VAR) + set(FORMAT "{{ index .Config.Labels \"${LABEL}\" }}") + execute_process(COMMAND ${DOCKER_EXE} inspect --format=${FORMAT} ${BASE_IMAGE} + OUTPUT_VARIABLE OUT + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_VARIABLE ERR + RESULT_VARIABLE RES + ) + if(NOT RES EQUAL 0) + message(WARNING "Could not get \"${LABEL}\" label from \"${BASE_IMAGE}\" image, it doesn't exist!") + endif() + + set(${VAR} "${OUT}" PARENT_SCOPE) +endfunction() + +GET_LABEL(${BASE_IMAGE} org.opencontainers.image.title ORG_LABEL_TITLE) +GET_LABEL(${BASE_IMAGE} org.opencontainers.image.source ORG_LABEL_SOURCE) +GET_LABEL(${BASE_IMAGE} org.opencontainers.image.description ORG_LABEL_DESCRIPTION) + +find_program(CTEST_EXE NAMES ctest REQUIRED) find_file(DXIL_DLL NAMES dxil.dll HINTS "$ENV{CMAKE_WINDOWS_KITS_10_DIR}/Redist/D3D/x64" "C:/Program Files (x86)/Windows Kits/10/Redist/D3D/x64" REQUIRED) set(ICU_GLOBALIZATION_DIR C:\\Windows\\Globalization\\ICU) @@ -144,6 +215,11 @@ ENTRYPOINT [ ` "copy", "C:\\mount\\Windows\\System32\\icu.dll", "C:\\Windows\\System32\\icu.dll", "&&", ` "node", "--no-warnings", "--no-deprecation", "--import=tsx", "./app.js", "--language", "hlsl" ` ] + +LABEL org.opencontainers.image.title="[Nabla Shader Compiler (NSC)]: @ORG_LABEL_TITLE@" +LABEL org.opencontainers.image.source=https://github.com/Devsh-Graphics-Programming/Nabla +LABEL org.opencontainers.image.description="[Nabla Shader Compiler (NSC)]: @ORG_LABEL_DESCRIPTION@" + ]=] INSTRUCTIONS @ONLY) set(DOCKERFILE "${NBL_DOCKER_CTX_DIR}/Dockerfile") @@ -157,58 +233,6 @@ set(NBL_DOCKER_NSC_COMPILER_CONFIG_OUTPUT "${NBL_DOCKER_CTX_DIR}/hlsl.local.prop string(GENEX_STRIP "${NBL_PACKAGE_RUNTIME_EXE_DIR_PATH}" NBL_RELATIVE_ENTRY) set(OUTPUT_CONFIG_FILE $) -function(PROMOTE_PROCESS_ISOLATION BASE VAR) - set(${VAR} True) - - macro(INSPECT IMAGE) - execute_process(COMMAND "${DOCKER_EXE}" inspect --format={{.OsVersion}} ${IMAGE} - RESULT_VARIABLE INSPECTION_OK - 
OUTPUT_VARIABLE TARGET_KERNEL - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - endmacro() - - macro(TO_PROCESS IMAGE TARGET_KERNEL) - execute_process(COMMAND "${DOCKER_EXE}" run --rm --isolation process --entrypoint cmd ${BASE} /K - RESULT_VARIABLE PROCESS_ISOLATION_OK - OUTPUT_QUIET ERROR_QUIET - ) - - if(${PROCESS_ISOLATION_OK} EQUAL 0) - message(STATUS "Promoting \"${IMAGE}\" [${TARGET_KERNEL}] to process isolation") - else() - set(${VAR} False) - message(STATUS "Cannot promote \"${IMAGE}\" [${TARGET_KERNEL}] to process isolation, requires falling back to HyperV. Please update your docker host OS.") - endif() - endmacro() - - INSPECT(${BASE}) - - if(${INSPECTION_OK} EQUAL 0) - TO_PROCESS(${BASE} ${TARGET_KERNEL}) - else() - message(STATUS "\"${BASE}\" not found in local registry, pulling...") - execute_process(COMMAND "${DOCKER_EXE}" pull ${BASE}) - - INSPECT(${BASE}) - TO_PROCESS(${BASE} ${TARGET_KERNEL}) - endif() - - set(${VAR} ${${VAR}} PARENT_SCOPE) -endfunction() - -PROMOTE_PROCESS_ISOLATION(${BASE_IMAGE} USE_PROCESS_ISOLATION) - -if(NOT USE_PROCESS_ISOLATION) - # NOTE: we would need to use GET_RUNTIME_DEPENDENCIES which uses objdump - # https://cmake.org/cmake/help/latest/command/file.html#get-runtime-dependencies - # to collect *all* missing deps and copy (FROM at least server core) to destination nano - # image, it will fail currently if we fully isolate it with VM due to lack of certain DLLs - # BUT it means violating EULA, hence we are not going to support it, also (**) - message(FATAL_ERROR "HyperV is NOT supported! 
Update your OS!") -endif() - -set(CORE_IMAGE mcr.microsoft.com/windows/servercore:ltsc2022) set(ICU_DIR C:\\Windows\\Globalization\\ICU) set(ICU_DLL C:\\Windows\\System32\\icu.dll) if(NOT EXISTS ${ICU_DIR} OR NOT EXISTS ${ICU_DLL}) @@ -240,75 +264,92 @@ set(NBL_NSC_BASIC_HLSL_JPAYLOAD "${CMAKE_CURRENT_SOURCE_DIR}/docker/godbolt/hlsl # to avoid "too long input" errors we proxy build instructions to CMake script and write it to build directory string(CONFIGURE [=[ message(STATUS "Killing remaining NSC orphans") -execute_process(COMMAND "${DOCKER_EXE}" rm -f "${ORPHAN}" RESULT_VARIABLE res) +execute_process(COMMAND "@DOCKER_EXE@" + rm -f "@ORPHAN@" + RESULT_VARIABLE res +) message(STATUS "Executing CTests") -execute_process(COMMAND "${CTEST_EXE}" -C "$" --stop-on-failure WORKING_DIRECTORY "@CMAKE_CURRENT_BINARY_DIR@" - COMMAND_ERROR_IS_FATAL ANY) +execute_process(COMMAND "@CTEST_EXE@" + -C "$" --stop-on-failure + WORKING_DIRECTORY "@CMAKE_CURRENT_BINARY_DIR@" + COMMAND_ERROR_IS_FATAL ANY +) message(STATUS "Generating NSC build info") -execute_process(COMMAND "${CMAKE_COMMAND}" - "-DNBL_EXECUTABLE_PATH=${NBL_NSC_PREINSTALL_TARGET_EXE_FILEPATH}" - "-DNBL_BUILD_INFO=${NBL_NSC_PREINSTALL_TARGET_BUILD_INFO}" - "-DNBL_OUTPUT_FILE=${NBL_NSC_PREINSTALL_TARGET_BUILD_INFO}" - "-DNBL_OUTPUT_EXE_OVERRIDE=$" - -P "${NBL_ROOT_PATH}/cmake/scripts/nbl/nablaBuildInfo.cmake" - COMMAND_ERROR_IS_FATAL ANY) +execute_process(COMMAND "@CMAKE_COMMAND@" + "-DNBL_EXECUTABLE_PATH=@NBL_NSC_PREINSTALL_TARGET_EXE_FILEPATH@" + "-DNBL_BUILD_INFO=@NBL_NSC_PREINSTALL_TARGET_BUILD_INFO@" + "-DNBL_OUTPUT_FILE=@NBL_NSC_PREINSTALL_TARGET_BUILD_INFO@" + "-DNBL_OUTPUT_EXE_OVERRIDE=$" + -P "@NBL_ROOT_PATH@/cmake/scripts/nbl/nablaBuildInfo.cmake" + COMMAND_ERROR_IS_FATAL ANY +) message(STATUS "Generating NSC godbolt config") -execute_process(COMMAND "${CMAKE_COMMAND}" - "-DSPIRV_DIS_EXE=spirv-dis.exe" - "-DNSC_RELEASE_BUILD_INFO=$" - "-DNSC_RELWITHDEBINFO_BUILD_INFO=$" - "-DNSC_DEBUG_BUILD_INFO=$" - 
"-DOUTPUT_CONFIG_FILE=${OUTPUT_CONFIG_FILE}" - -P "${CMAKE_CURRENT_SOURCE_DIR}/ce-generate-config.cmake" - COMMAND_ERROR_IS_FATAL ANY) +execute_process(COMMAND "@CMAKE_COMMAND@" + "-DSPIRV_DIS_EXE=spirv-dis.exe" + "-DNSC_RELEASE_BUILD_INFO=$" + "-DNSC_RELWITHDEBINFO_BUILD_INFO=$" + "-DNSC_DEBUG_BUILD_INFO=$" + "-DOUTPUT_CONFIG_FILE=@OUTPUT_CONFIG_FILE@" + -P "@CMAKE_CURRENT_SOURCE_DIR@/ce-generate-config.cmake" + COMMAND_ERROR_IS_FATAL ANY +) message(STATUS "Updating NSC package context") -execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory_if_different - "$" - "${NBL_DOCKER_CTX_DIR}/Nabla" - COMMAND_ERROR_IS_FATAL ANY) +execute_process(COMMAND "@CMAKE_COMMAND@" -E copy_directory_if_different + "$" + "@NBL_DOCKER_CTX_DIR@/Nabla" + COMMAND_ERROR_IS_FATAL ANY +) message(STATUS "Building NSC Godbolt image") -execute_process(COMMAND "${DOCKER_EXE}" build --isolation process - -f "${DOCKERFILE}" - -t ${NSC_IMAGE_NAME} - "${NBL_DOCKER_CTX_DIR}" - COMMAND_ERROR_IS_FATAL ANY) +string(TIMESTAMP BUILD_TIMESTAMP "%Y-%m-%dT%H:%M:%SZ" UTC) +execute_process(COMMAND "@DOCKER_EXE@" build --isolation process + --label=org.opencontainers.image.created="${BUILD_TIMESTAMP}" + -f "@DOCKERFILE@" -t @NSC_IMAGE_NAME@ "@NBL_DOCKER_CTX_DIR@" + COMMAND_ERROR_IS_FATAL ANY +) message(STATUS "Running new NSC orphan container") -execute_process(COMMAND "${DOCKER_EXE}" run -di -p ${NBL_CE_PUBLISH_PORT}:10240 --isolation process - --name "${ORPHAN}" --network docker_default - -v $ - -v $ - ${NSC_IMAGE_NAME} - COMMAND_ERROR_IS_FATAL ANY) +execute_process(COMMAND "@DOCKER_EXE@" run -di -p @NBL_CE_PUBLISH_PORT@:10240 --isolation process + --name "@ORPHAN@" --network docker_default + -v $ + -v $ + @NSC_IMAGE_NAME@ + COMMAND_ERROR_IS_FATAL ANY +) message(STATUS "Healthy check") -execute_process(COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_HEALTHY_CHECK_PY}" - --url "${NBL_CE_URL}" --interval 5 --ticks 12 - COMMAND_ERROR_IS_FATAL ANY) +execute_process(COMMAND "@_Python3_EXECUTABLE@" 
"@NBL_CE_HEALTHY_CHECK_PY@" + --url "@NBL_CE_URL@" --interval 5 --ticks 12 + COMMAND_ERROR_IS_FATAL ANY +) message(STATUS "Post Basic NSC shader compile check") -execute_process(COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" - --url "${NBL_CE_URL}" - --endpoint /api/compiler/nsc_$>_upstream/compile - --method POST --json "${NBL_NSC_BASIC_HLSL_JPAYLOAD}" - COMMAND_ERROR_IS_FATAL ANY) +execute_process(COMMAND "@_Python3_EXECUTABLE@" "@NBL_CE_ENDPOINT_PY@" + --url "@NBL_CE_URL@" + --endpoint /api/compiler/nsc_$>_upstream/compile + --method POST --json "@NBL_NSC_BASIC_HLSL_JPAYLOAD@" + COMMAND_ERROR_IS_FATAL ANY +) message(STATUS "Printing NSC container logs") -execute_process(COMMAND "${DOCKER_EXE}" logs "${ORPHAN}" COMMAND_ERROR_IS_FATAL ANY) +execute_process(COMMAND "@DOCKER_EXE@" + logs "@ORPHAN@" + COMMAND_ERROR_IS_FATAL ANY +) message(STATUS "OK! NSC container is healthy.") -message(STATUS "Type \"${NBL_CE_URL}\" in your browser to use NSC with Godbolt!") -]=] INSTRUCTIONS) +message(STATUS "Type \"@NBL_CE_URL@\" in your browser to use NSC with Godbolt!") +]=] INSTRUCTIONS @ONLY) -file(GENERATE OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/run-compiler-explorer-$.cmake" CONTENT "${INSTRUCTIONS}") +set(SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/run-compiler-explorer-$.cmake") +file(GENERATE OUTPUT ${SCRIPT_FILE} CONTENT "${INSTRUCTIONS}") add_custom_target(run-compiler-explorer ALL - COMMAND "${CMAKE_COMMAND}" -P "${CMAKE_CURRENT_BINARY_DIR}/run-compiler-explorer-$.cmake" + COMMAND "${CMAKE_COMMAND}" -P ${SCRIPT_FILE} VERBATIM COMMAND_EXPAND_LISTS ) From d5318514a57d0d25f7c1710cc079092a33516afd Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 28 May 2025 10:36:32 +0200 Subject: [PATCH 237/346] correct passing vars in shell --- .github/workflows/build-nabla.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index e15d1a5ab2..d5d2a2b8d6 100644 --- 
a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -52,9 +52,12 @@ jobs: - name: Set prefix id: set-prefix + shell: pwsh run: | $prefix = "run-windows-${{ matrix.tag }}-${{ matrix.vendor }}-${{ matrix.config }}" - $nscTargetTaggedImage = "ghcr.io/$env:GITHUB_REPOSITORY:nsc-godbolt-build-${{ matrix.vendor }}-${{ matrix.config }}-${{ matrix.tag }}".ToLower() + $repo = $env:GITHUB_REPOSITORY + $tag = "nsc-godbolt-build-${{ matrix.vendor }}-${{ matrix.config }}-${{ matrix.tag }}" + $nscTargetTaggedImage = "ghcr.io/$repo:$tag".ToLower() "prefix=$prefix" >> $env:GITHUB_OUTPUT "nscTargetTaggedImage=$nscTargetTaggedImage" >> $env:GITHUB_OUTPUT @@ -98,7 +101,7 @@ jobs: --preset ci-configure-dynamic-${{ matrix.vendor }} ` --profiling-output=profiling/cmake-profiling.json ` --profiling-format=google-trace - -DNSC_IMAGE_NAME=${{ steps.set-prefix.outputs.nscTargetTaggedImage }} + "-DNSC_IMAGE_NAME=${{ steps.set-prefix.outputs.nscTargetTaggedImage }}" - name: Container – Build NSC run: | From 2074c138b71e2af95242d030b4fce5742a313027 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 28 May 2025 10:49:35 +0200 Subject: [PATCH 238/346] post fixes to actions, use ${} to delimit var name --- .github/workflows/build-nabla.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index d5d2a2b8d6..e022353652 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -44,11 +44,6 @@ jobs: docker network create --driver nat docker_default if ($LASTEXITCODE -ne 0) { exit 1 } } - - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: 'recursive' - name: Set prefix id: set-prefix @@ -57,10 +52,15 @@ jobs: $prefix = "run-windows-${{ matrix.tag }}-${{ matrix.vendor }}-${{ matrix.config }}" $repo = $env:GITHUB_REPOSITORY $tag = "nsc-godbolt-build-${{ matrix.vendor }}-${{ matrix.config }}-${{ matrix.tag }}" - 
$nscTargetTaggedImage = "ghcr.io/$repo:$tag".ToLower() + $nscTargetTaggedImage = "ghcr.io/${repo}:${tag}".ToLower() "prefix=$prefix" >> $env:GITHUB_OUTPUT "nscTargetTaggedImage=$nscTargetTaggedImage" >> $env:GITHUB_OUTPUT + + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: 'recursive' - name: Pull Image run: | From 353c46775da80c04678b8a732be9e1af7950b233 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 28 May 2025 11:33:39 +0200 Subject: [PATCH 239/346] ahh typo! --- .github/workflows/build-nabla.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index e022353652..a7b0fe3e68 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -100,7 +100,7 @@ jobs: ${{ env.entry }} ${{ env.cmd }} -Command cmake ` --preset ci-configure-dynamic-${{ matrix.vendor }} ` --profiling-output=profiling/cmake-profiling.json ` - --profiling-format=google-trace + --profiling-format=google-trace ` "-DNSC_IMAGE_NAME=${{ steps.set-prefix.outputs.nscTargetTaggedImage }}" - name: Container – Build NSC From b8d53cccb83629454bbb48ab827e0172a03bb26f Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 28 May 2025 16:48:27 +0700 Subject: [PATCH 240/346] Fix vulkan ray tracing creation --- include/nbl/video/CVulkanRayTracingPipeline.h | 5 ++- src/nbl/video/CVulkanComputePipeline.h | 5 ++- src/nbl/video/CVulkanLogicalDevice.cpp | 3 +- src/nbl/video/CVulkanRayTracingPipeline.cpp | 33 ++++++++++--------- src/nbl/video/ILogicalDevice.cpp | 6 ++-- 5 files changed, 27 insertions(+), 25 deletions(-) diff --git a/include/nbl/video/CVulkanRayTracingPipeline.h b/include/nbl/video/CVulkanRayTracingPipeline.h index 82d8c777b6..a9bc476f43 100644 --- a/include/nbl/video/CVulkanRayTracingPipeline.h +++ b/include/nbl/video/CVulkanRayTracingPipeline.h @@ -41,10 +41,13 @@ class CVulkanRayTracingPipeline final : public IGPURayTracingPipeline const VkPipeline 
m_vkPipeline; ShaderGroupHandleContainer m_shaderGroupHandles; - uint16_t m_raygenStackSize; core::smart_refctd_dynamic_array m_missStackSizes; core::smart_refctd_dynamic_array m_hitGroupStackSizes; core::smart_refctd_dynamic_array m_callableStackSizes; + uint32_t m_missGroupCount; + uint32_t m_hitGroupCount; + uint32_t m_callableGroupCount; + uint16_t m_raygenStackSize; uint32_t getRaygenIndex() const; uint32_t getMissBaseIndex() const; diff --git a/src/nbl/video/CVulkanComputePipeline.h b/src/nbl/video/CVulkanComputePipeline.h index 76fb346e30..89077f9a9a 100644 --- a/src/nbl/video/CVulkanComputePipeline.h +++ b/src/nbl/video/CVulkanComputePipeline.h @@ -15,10 +15,9 @@ class CVulkanComputePipeline final : public IGPUComputePipeline { public: CVulkanComputePipeline( - core::smart_refctd_ptr&& _layout, - const core::bitflag _flags, + const SCreationParams& params, const VkPipeline pipeline - ) : IGPUComputePipeline(std::move(_layout),_flags), m_pipeline(pipeline) {} + ) : IGPUComputePipeline(params), m_pipeline(pipeline) {} inline const void* getNativeHandle() const override { return &m_pipeline; } diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 6050b7a7a0..216fefcef9 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -1182,8 +1182,7 @@ void CVulkanLogicalDevice::createComputePipelines_impl( // break the lifetime cause of the aliasing std::uninitialized_default_construct_n(output+i,1); output[i] = core::make_smart_refctd_ptr( - core::smart_refctd_ptr(info.layout), - info.flags,vk_pipeline + info,vk_pipeline ); debugNameBuilder.str(""); const auto& specInfo = createInfos[i].shader; diff --git a/src/nbl/video/CVulkanRayTracingPipeline.cpp b/src/nbl/video/CVulkanRayTracingPipeline.cpp index a107d3bbed..960d78428a 100644 --- a/src/nbl/video/CVulkanRayTracingPipeline.cpp +++ b/src/nbl/video/CVulkanRayTracingPipeline.cpp @@ -15,17 +15,17 @@ namespace nbl::video 
ShaderGroupHandleContainer&& shaderGroupHandles) : IGPURayTracingPipeline(params), m_vkPipeline(vk_pipeline), + m_shaderGroupHandles(std::move(shaderGroupHandles)), m_missStackSizes(core::make_refctd_dynamic_array(params.shaderGroups.misses.size())), m_hitGroupStackSizes(core::make_refctd_dynamic_array(params.shaderGroups.hits.size())), - m_callableStackSizes(core::make_refctd_dynamic_array(params.shaderGroups.hits.size())), - m_shaderGroupHandles(std::move(shaderGroupHandles)) + m_callableStackSizes(core::make_refctd_dynamic_array(params.shaderGroups.hits.size())) { const auto* vulkanDevice = static_cast(getOriginDevice()); auto* vk = vulkanDevice->getFunctionTable(); - auto getVkShaderGroupStackSize = [&](uint32_t baseGroupIx, uint32_t shaderGroupIx, uint32_t shaderIx, VkShaderGroupShaderKHR shaderType) -> uint16_t + auto getVkShaderGroupStackSize = [&](uint32_t baseGroupIx, uint32_t shaderGroupIx, const asset::IShader* shader, VkShaderGroupShaderKHR shaderType) -> uint16_t { - if (shaderIx == SShaderGroupsParams::SIndex::Unused) + if (shader == nullptr) return 0; return vk->vk.vkGetRayTracingShaderGroupStackSizeKHR( @@ -36,14 +36,17 @@ namespace nbl::video ); }; - m_raygenStackSize = getVkShaderGroupStackSize(getRaygenIndex(), 0, params.shaderGroups.raygen.index, VK_SHADER_GROUP_SHADER_GENERAL_KHR); + m_callableGroupCount = params.shaderGroups.callables.size(); + m_missGroupCount = params.shaderGroups.misses.size(); + m_hitGroupCount = params.shaderGroups.hits.size(); + m_raygenStackSize = getVkShaderGroupStackSize(getRaygenIndex(), 0, params.shaderGroups.raygen.shader, VK_SHADER_GROUP_SHADER_GENERAL_KHR); for (size_t shaderGroupIx = 0; shaderGroupIx < params.shaderGroups.misses.size(); shaderGroupIx++) { m_missStackSizes->operator[](shaderGroupIx) = getVkShaderGroupStackSize( getMissBaseIndex(), shaderGroupIx, - params.shaderGroups.misses[shaderGroupIx].index, + params.shaderGroups.misses[shaderGroupIx].shader, VK_SHADER_GROUP_SHADER_GENERAL_KHR); } @@ -52,9 
+55,9 @@ namespace nbl::video const auto& hitGroup = params.shaderGroups.hits[shaderGroupIx]; const auto baseIndex = getHitBaseIndex(); m_hitGroupStackSizes->operator[](shaderGroupIx) = SHitGroupStackSize{ - .closestHit = getVkShaderGroupStackSize(baseIndex,shaderGroupIx, hitGroup.closestHit, VK_SHADER_GROUP_SHADER_CLOSEST_HIT_KHR), - .anyHit = getVkShaderGroupStackSize(baseIndex, shaderGroupIx, hitGroup.anyHit,VK_SHADER_GROUP_SHADER_ANY_HIT_KHR), - .intersection = getVkShaderGroupStackSize(baseIndex, shaderGroupIx, hitGroup.intersection, VK_SHADER_GROUP_SHADER_INTERSECTION_KHR), + .closestHit = getVkShaderGroupStackSize(baseIndex,shaderGroupIx, hitGroup.closestHit.shader, VK_SHADER_GROUP_SHADER_CLOSEST_HIT_KHR), + .anyHit = getVkShaderGroupStackSize(baseIndex, shaderGroupIx, hitGroup.anyHit.shader,VK_SHADER_GROUP_SHADER_ANY_HIT_KHR), + .intersection = getVkShaderGroupStackSize(baseIndex, shaderGroupIx, hitGroup.intersection.shader, VK_SHADER_GROUP_SHADER_INTERSECTION_KHR), }; } @@ -63,7 +66,7 @@ namespace nbl::video m_callableStackSizes->operator[](shaderGroupIx) = getVkShaderGroupStackSize( getCallableBaseIndex(), shaderGroupIx, - params.shaderGroups.callables[shaderGroupIx].index, + params.shaderGroups.callables[shaderGroupIx].shader, VK_SHADER_GROUP_SHADER_GENERAL_KHR); } } @@ -83,19 +86,19 @@ namespace nbl::video std::span CVulkanRayTracingPipeline::getMissHandles() const { const auto baseIndex = getMissBaseIndex(); - return std::span(m_shaderGroupHandles->begin() + baseIndex, m_missShaderGroups->size()); + return std::span(m_shaderGroupHandles->begin() + baseIndex, m_missGroupCount); } std::span CVulkanRayTracingPipeline::getHitHandles() const { const auto baseIndex = getHitBaseIndex(); - return std::span(m_shaderGroupHandles->begin() + baseIndex, m_hitShaderGroups->size()); + return std::span(m_shaderGroupHandles->begin() + baseIndex, m_hitGroupCount); } std::span CVulkanRayTracingPipeline::getCallableHandles() const { const auto baseIndex = 
getCallableBaseIndex(); - return std::span(m_shaderGroupHandles->begin() + baseIndex, m_callableShaderGroups->size()); + return std::span(m_shaderGroupHandles->begin() + baseIndex, m_callableGroupCount); } uint16_t CVulkanRayTracingPipeline::getRaygenStackSize() const @@ -159,13 +162,13 @@ namespace nbl::video uint32_t CVulkanRayTracingPipeline::getHitBaseIndex() const { // one raygen group + miss groups before this groups - return 1 + m_missShaderGroups->size(); + return 1 + m_missGroupCount; } uint32_t CVulkanRayTracingPipeline::getCallableBaseIndex() const { // one raygen group + miss groups + hit groups before this groups - return 1 + m_missShaderGroups->size() + m_hitShaderGroups->size(); + return 1 + m_missGroupCount + m_hitGroupCount; } } diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index c019be84a7..0056cc3a2a 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -797,10 +797,8 @@ bool ILogicalDevice::createComputePipelines(IGPUPipelineCache* const pipelineCac } core::vector newParams(params.begin(), params.end()); - const auto shaderCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) - { - return sum + param.getShaders().size(); - }); + const auto shaderCount = params.size(); + core::vector> debloatedShaders; // vector to hold all the debloated shaders, so the pointer from the new ShaderSpecInfo is not dangling debloatedShaders.reserve(shaderCount); From f26201e29746ecbd7fb126a9fab50cccad70b6e6 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 28 May 2025 16:48:39 +0700 Subject: [PATCH 241/346] Another fix to CCOmputeBlit --- src/nbl/video/utilities/CComputeBlit.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nbl/video/utilities/CComputeBlit.cpp b/src/nbl/video/utilities/CComputeBlit.cpp index edac6e1f5c..a402df2137 100644 --- a/src/nbl/video/utilities/CComputeBlit.cpp +++ b/src/nbl/video/utilities/CComputeBlit.cpp @@ -66,7 
+66,7 @@ struct ConstevalParameters }(); auto createPipeline = [&limits,layout,&common](const char* mainPath)->smart_refctd_ptr { - auto shader = make_smart_refctd_ptr( + auto shader = make_smart_refctd_ptr( (common+"\n#include \""+mainPath+"\"\n").c_str(), IShader::E_CONTENT_TYPE::ECT_HLSL, mainPath From b18c83425b548aa5cd2fca4f7d5f80127a099766 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 28 May 2025 12:02:19 +0200 Subject: [PATCH 242/346] pass NSC_IMAGE_NAME with ENV as it glitches when using CMake CLI due to . and / chars --- .github/workflows/build-nabla.yml | 4 ++-- tools/nsc/CMakeLists.txt | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index a7b0fe3e68..f93c0c270f 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -77,6 +77,7 @@ jobs: --entrypoint ${{ env.entry }} -di --isolation process ` --env-file .\docker\ci-windows.env ` --env-file .\docker\ninja.env ` + --env "NSC_IMAGE_NAME=${{ steps.set-prefix.outputs.nscTargetTaggedImage }}" ` --name orphan --network docker_default ` -v "${{ github.workspace }}:${{ env.mount }}" ` -v "${pipeHost}:\\.\pipe\dockerd" -e "DOCKER_HOST=npipe:////./pipe/dockerd" ` @@ -100,8 +101,7 @@ jobs: ${{ env.entry }} ${{ env.cmd }} -Command cmake ` --preset ci-configure-dynamic-${{ matrix.vendor }} ` --profiling-output=profiling/cmake-profiling.json ` - --profiling-format=google-trace ` - "-DNSC_IMAGE_NAME=${{ steps.set-prefix.outputs.nscTargetTaggedImage }}" + --profiling-format=google-trace - name: Container – Build NSC run: | diff --git a/tools/nsc/CMakeLists.txt b/tools/nsc/CMakeLists.txt index d3b8bdf94a..bcdcbca531 100644 --- a/tools/nsc/CMakeLists.txt +++ b/tools/nsc/CMakeLists.txt @@ -225,7 +225,9 @@ LABEL org.opencontainers.image.description="[Nabla Shader Compiler (NSC)]: @ORG_ set(DOCKERFILE "${NBL_DOCKER_CTX_DIR}/Dockerfile") file(WRITE "${DOCKERFILE}" "${INSTRUCTIONS}") 
-if(NOT DEFINED NSC_IMAGE_NAME) +if(DEFINED ENV{NSC_IMAGE_NAME}) + set(NSC_IMAGE_NAME "$ENV{NSC_IMAGE_NAME}") +else() set(NSC_IMAGE_NAME nano/godbolt/nsc) endif() From 7e6af2471090f2a6e19121e5570a3423d1ee0bcf Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 28 May 2025 15:21:30 +0200 Subject: [PATCH 243/346] upload NSC image to Github Container Registry on master push, create compose.yml for deploys --- .github/workflows/build-nabla.yml | 22 +++++++++++++++++++++- compose.yml | 16 ++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 compose.yml diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index f93c0c270f..e2ce30cc05 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -53,10 +53,15 @@ jobs: $repo = $env:GITHUB_REPOSITORY $tag = "nsc-godbolt-build-${{ matrix.vendor }}-${{ matrix.config }}-${{ matrix.tag }}" $nscTargetTaggedImage = "ghcr.io/${repo}:${tag}".ToLower() + $nscTargetTaggedImageLatest = "ghcr.io/${repo}:nsc-godbolt-latest".ToLower() + + $shouldPushImage = ("${{ github.ref }}" -eq "refs/heads/master" -and "${{ matrix.vendor }}" -eq "msvc") "prefix=$prefix" >> $env:GITHUB_OUTPUT "nscTargetTaggedImage=$nscTargetTaggedImage" >> $env:GITHUB_OUTPUT - + "nscTargetTaggedImageLatest=$nscTargetTaggedImageLatest" >> $env:GITHUB_OUTPUT + "shouldPushImage=$shouldPushImage" >> $env:GITHUB_OUTPUT + - name: Checkout uses: actions/checkout@v4 with: @@ -137,6 +142,7 @@ jobs: with: name: ${{ steps.set-prefix.outputs.prefix }}-nsc-godbolt-image path: ${{ steps.set-prefix.outputs.prefix }}-nsc-godbolt-image.tar.zst + compression-level: 0 - name: Upload profiling artifacts uses: actions/upload-artifact@v4 @@ -149,3 +155,17 @@ jobs: with: name: ${{ steps.set-prefix.outputs.prefix }}-install path: ${{ steps.set-prefix.outputs.prefix }}-install.tar + + - name: Login to GHCR + if: steps.set-prefix.outputs.shouldPushImage == 'True' + run: echo "${{ secrets.CR_PAT 
}}" | docker login ghcr.io -u $env:GITHUB_ACTOR --password-stdin + + - name: Tag Latest image + if: steps.set-prefix.outputs.shouldPushImage == 'True' + run: | + docker tag ${{ steps.set-prefix.outputs.nscTargetTaggedImage }} ${{ steps.set-prefix.outputs.nscTargetTaggedImageLatest }} + + - name: Push images to GHCR + if: steps.set-prefix.outputs.shouldPushImage == 'True' + run: | + docker push ${{ steps.set-prefix.outputs.nscTargetTaggedImageLatest }} \ No newline at end of file diff --git a/compose.yml b/compose.yml new file mode 100644 index 0000000000..8d6f1bc64a --- /dev/null +++ b/compose.yml @@ -0,0 +1,16 @@ +services: + nsc: + container_name: nsc-godbolt + image: ghcr.io/devsh-graphics-programming/nabla:nsc-godbolt-latest + isolation: process + ports: + - "80:10240" + volumes: + - type: bind + source: C:\Windows\Globalization\ICU + target: C:\Windows\Globalization\ICU + read_only: true + - type: bind + source: C:\Windows\System32 + target: C:\mount\Windows\System32 + read_only: true \ No newline at end of file From e5b229ac6dd960fabfec8b83c8af8f5bdab41620 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 28 May 2025 15:36:47 +0200 Subject: [PATCH 244/346] lock on push, update .github/workflows/build-nabla.yml --- .github/workflows/build-nabla.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index e2ce30cc05..9f1e203f1e 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -5,6 +5,10 @@ on: pull_request: workflow_dispatch: +concurrency: + group: push-lock-${{ github.ref }} + cancel-in-progress: true + jobs: build-windows: runs-on: windows-2022 From 9328fd434a07e8ef24b382ba6a21dd37b671cb5d Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 28 May 2025 16:48:20 +0200 Subject: [PATCH 245/346] update shouldPushImage logic --- .github/workflows/build-nabla.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff 
--git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 9f1e203f1e..67fc9c4401 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -59,8 +59,14 @@ jobs: $nscTargetTaggedImage = "ghcr.io/${repo}:${tag}".ToLower() $nscTargetTaggedImageLatest = "ghcr.io/${repo}:nsc-godbolt-latest".ToLower() - $shouldPushImage = ("${{ github.ref }}" -eq "refs/heads/master" -and "${{ matrix.vendor }}" -eq "msvc") + $shouldPushImage = ( + "${{ github.ref }}" -eq "refs/heads/master" -and + "${{ matrix.vendor }}" -eq "msvc" -and + "${{ matrix.config }}" -eq "Release" + ) + Write-Host "::notice::Should push image? $shouldPushImage" + "prefix=$prefix" >> $env:GITHUB_OUTPUT "nscTargetTaggedImage=$nscTargetTaggedImage" >> $env:GITHUB_OUTPUT "nscTargetTaggedImageLatest=$nscTargetTaggedImageLatest" >> $env:GITHUB_OUTPUT From 127c6d9593baa2dc950d9c76c80bf405ae6c76f2 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 29 May 2025 17:29:27 +0700 Subject: [PATCH 246/346] some fixes to indexing --- examples_tests | 2 +- .../hlsl/workgroup2/arithmetic_config.hlsl | 6 ++++- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 25 ++++++++----------- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/examples_tests b/examples_tests index 3d63ed7328..f202ef5632 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 3d63ed732838c3073dfb7993d3eb1305fb5882be +Subproject commit f202ef563249c172d4a6c699379c6793ae939863 diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index c0e105e700..2f1a8b06a0 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -101,9 +101,13 @@ struct ArithmeticConfiguration return sharedStoreIndex(virtualID); } + template static uint32_t sharedLoadIndex(const uint32_t invocationIndex, const uint32_t component) { - return 
component * SubgroupSize + invocationIndex; + if (level == LevelCount-1) + return component * SubgroupSize + invocationIndex; + else + return component * __SubgroupsPerVirtualWorkgroup + invocationIndex; } }; diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 195431c5d3..1d386835b9 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -124,7 +124,7 @@ struct reduce vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::sharedLoadIndex(invocationIndex, i),lv1_val[i]); + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); lv1_val = reduction1(lv1_val); if (Config::electLast()) @@ -183,15 +183,14 @@ struct scan if (glsl::gl_SubgroupID() == 0) { vector_lv1_t lv1_val; - const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::sharedLoadIndex(invocationIndex, i)-1,lv1_val[i]); + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i)-1,lv1_val[i]); lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); lv1_val = inclusiveScan1(lv1_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template set(Config::sharedLoadIndex(invocationIndex, i),lv1_val[i]); + scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -253,11 +252,11 @@ struct reduce vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::sharedLoadIndex(invocationIndex, i),lv1_val[i]); + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); lv1_val 
= reduction1(lv1_val); if (Config::electLast()) { - const uint32_t bankedIndex = Config::template sharedStoreIndex<2>(invocationIndex); + const uint32_t bankedIndex = Config::template sharedStoreIndex<2>(glsl::gl_SubgroupID()); scratchAccessor.template set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } @@ -270,7 +269,7 @@ struct reduce vector_lv2_t lv2_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(lv1_smem_size+Config::sharedLoadIndex(invocationIndex, i),lv2_val[i]); + scratchAccessor.template get(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); lv2_val = reduction2(lv2_val); if (Config::electLast()) scratchAccessor.template set(0, lv2_val[Config::ItemsPerInvocation_2-1]); @@ -309,15 +308,14 @@ struct scan if (glsl::gl_SubgroupID() < Config::SubgroupsSize*Config::ItemsPerInvocation_2) { vector_lv1_t lv1_val; - const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::sharedLoadIndex(invocationIndex, i)-1,lv1_val[i]); + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i)-1,lv1_val[i]); lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); lv1_val = inclusiveScan1(lv1_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template set(Config::sharedLoadIndex(invocationIndex, i),lv1_val[i]); + scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); if (Config::electLast()) { const uint32_t bankedIndex = Config::template sharedStoreIndex<2>(glsl::gl_SubgroupID()); @@ -331,15 +329,14 @@ struct scan if (glsl::gl_SubgroupID() == 0) { vector_lv2_t lv2_val; - const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template 
get(lv1_smem_size+Config::sharedLoadIndex(invocationIndex, i)-1,lv2_val[i]); + scratchAccessor.template get(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i)-1,lv2_val[i]); lv2_val[0] = hlsl::mix(BinOp::identity, lv2_val[0], bool(invocationIndex)); lv2_val = inclusiveScan2(lv2_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template set(lv1_smem_size+Config::sharedLoadIndex(invocationIndex, i),lv2_val[i]); + scratchAccessor.template set(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -357,7 +354,7 @@ struct scan [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template set(Config::sharedLoadIndex(invocationIndex, i), binop(lv1_val[i],lv2_scan)); + scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i), binop(lv1_val[i],lv2_scan)); } // combine with level 0 From 52c7db99f99c3f349eb29675941184c606ff7269 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Thu, 29 May 2025 14:05:05 +0200 Subject: [PATCH 247/346] Update tools/nsc/docker/README.md --- tools/nsc/docker/README.md | 87 +++++++++++++++++++++++++++++++++++--- 1 file changed, 80 insertions(+), 7 deletions(-) diff --git a/tools/nsc/docker/README.md b/tools/nsc/docker/README.md index 21f8f4e06d..a18fe48c1f 100644 --- a/tools/nsc/docker/README.md +++ b/tools/nsc/docker/README.md @@ -1,16 +1,89 @@ -# NSC Docker Godbolt +# NSC & Godbolt integration -## Run NSC tool straight from build directory in compiler explorer docker container! +## Run Compiler Explorer with NSC tool in docker container! -Currently only Windows platform with target *x86_64* architecture is supported. Tested with Hyper-V isolation mode. 
+https://github.com/user-attachments/assets/8d409477-92e4-4238-b5e5-637cfbdf7263 -### Requirements +## Requirements -- [***Docker Desktop***](https://www.docker.com/products/docker-desktop/) +- Configured [***Docker***](https://docs.docker.com/desktop/setup/install/windows-install/) for Windows Containers +- [Windows, Windows Server Core or Windows Server]() with **minumum** x86_64 10.0.20348 build (2022 distributions) -### How To +> [!TIP] +> type `cmd /ver` to see your build version -Switch docker to windows containers, configure CMake with `NBL_ENABLE_DOCKER_INTEGRATION` option (recommended Visual Studio generator) & build `run-compiler-explorer` target. After the build completes type `localhost` in your browser. +> [!CAUTION] +> Hyper-V is **NOT** supported, you must run NSC Godbolt container as process + +## How to run image + +> [!IMPORTANT] +> If using Docker Desktop - first make sure you have switched to `Containers for Windows`, see image bellow. If you are CLI user and have client & daemon headless then use appropriate windows build context. ![Containers for Windows](https://user-images.githubusercontent.com/65064509/152947300-affca592-35a7-4e4c-a7fc-2055ce1ba528.png) +> [!CAUTION] +> Examples bellow use `docker compose` to run the image but if you want to `docker run` then make sure to mount required system directories and expose port otherwise will fail, see the compose file for more details + +### from container registry + +execute + +```powershell +curl -L https://raw.githubusercontent.com/Devsh-Graphics-Programming/Nabla/master/compose.yml | docker compose -f - up +``` + +or in Nabla checkout + +```powershell +docker compose up +``` + +and type `localhost` in your browser. + +### from Nabla pipeline workflow artifacts + +> [!NOTE] +> We publish container images to the GitHub Container Registry that include **only the Release variant** of NSC executables built with **MSVC**. +> However, our CI pipelines **build and test all configurations**. 
Compressed images for each configuration are uploaded as **workflow artifacts**. +> Look for artifacts named: +> `-msvc--nsc-godbolt-image` + +> [!NOTE] +> To decompress image artifact you need [zstd]() + +Download workflow image artifact, unzip and + +```powershell +zstd -d < -msvc--nsc-godbolt-image.tar.zst | docker load +``` + +
+Docker load example (click to expand) + +``` +C:\Users\anastaziuk\Desktop\DevshGraphicsProgramming\Nabla\tools\nsc\docker>zstd -d < run-windows-17.13.6-msvc-Debug-nsc-godbolt-image.tar.zst | docker load +b2ebf78c3627: Loading layer [==================================================>] 3.149MB/3.149MB +4c201e14cc01: Loading layer [==================================================>] 77.4MB/77.4MB +68a216251b8f: Loading layer [==================================================>] 61.95kB/61.95kB +7a4e13ca4c4e: Loading layer [==================================================>] 52.74kB/52.74kB +634001f55b21: Loading layer [==================================================>] 52.74kB/52.74kB +6a609178bb9a: Loading layer [==================================================>] 52.74kB/52.74kB +3d7afb042308: Loading layer [==================================================>] 52.74kB/52.74kB +ca034d7bc58a: Loading layer [==================================================>] 52.74kB/52.74kB +55b4134a1ae9: Loading layer [==================================================>] 52.74kB/52.74kB +0648adff3faa: Loading layer [==================================================>] 52.74kB/52.74kB +Loaded image: ghcr.io/devsh-graphics-programming/nabla:nsc-godbolt-build-msvc-debug-17.13.6 +``` + +
+ +copy `compose.yml` in Nabla root directory to eg. `override-compose.yml`, replace it's `image` field value with loaded image name (eg. `ghcr.io/devsh-graphics-programming/nabla:nsc-godbolt-build-msvc-debug-17.13.6` like in the example) and execute + +``` +docker compose -f override-compose.yml up +``` + +## How to build image + +Configure CMake with `NBL_ENABLE_DOCKER_INTEGRATION` and build `run-compiler-explorer` target. From 531784f6daa447884f7d155c9f2dffc4f0abb85e Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Thu, 29 May 2025 14:08:59 +0200 Subject: [PATCH 248/346] post tools/nsc/docker/README.md updates --- tools/nsc/docker/README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/nsc/docker/README.md b/tools/nsc/docker/README.md index a18fe48c1f..afd8b0f8b7 100644 --- a/tools/nsc/docker/README.md +++ b/tools/nsc/docker/README.md @@ -23,7 +23,7 @@ https://github.com/user-attachments/assets/8d409477-92e4-4238-b5e5-637cfbdf7263 ![Containers for Windows](https://user-images.githubusercontent.com/65064509/152947300-affca592-35a7-4e4c-a7fc-2055ce1ba528.png) > [!CAUTION] -> Examples bellow use `docker compose` to run the image but if you want to `docker run` then make sure to mount required system directories and expose port otherwise will fail, see the compose file for more details +> Examples bellow use `docker compose` to run the image but if you want to `docker run` then make sure to mount required system directories and expose port otherwise it will fail in runtime, see the [compose]() file for more details ### from container registry @@ -78,12 +78,14 @@ Loaded image: ghcr.io/devsh-graphics-programming/nabla:nsc-godbolt-build-msvc-de -copy `compose.yml` in Nabla root directory to eg. `override-compose.yml`, replace it's `image` field value with loaded image name (eg. 
`ghcr.io/devsh-graphics-programming/nabla:nsc-godbolt-build-msvc-debug-17.13.6` like in the example) and execute +copy `compose.yml` in Nabla root directory to eg. `override-compose.yml`, replace it's `image` field value with loaded image name (eg. `ghcr.io/devsh-graphics-programming/nabla:nsc-godbolt-build-msvc-debug-17.13.6` like in the example) then execute ``` docker compose -f override-compose.yml up ``` +and type `localhost` in your browser. + ## How to build image Configure CMake with `NBL_ENABLE_DOCKER_INTEGRATION` and build `run-compiler-explorer` target. From edac59f31c8edfdafd05db4e7961bb5c14435713 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 29 May 2025 20:44:35 +0700 Subject: [PATCH 249/346] Fix AssetConvert to use the current SpecInfo --- src/nbl/video/utilities/CAssetConverter.cpp | 75 +++++++++++---------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index d1615a4637..ce46d5a9a8 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -519,8 +519,8 @@ class AssetVisitor : public CRTP if (!layout || !descend(layout,{layout})) return false; const auto& specInfo = asset->getSpecInfo(); - const auto* shader = specInfo.shader; - if (!shader || !descend(shader,{shader},specInfo)) + const auto* shader = specInfo.shader.get(); + if (!shader || !descend(shader,{shader},specInfo, hlsl::ESS_COMPUTE)) return false; return true; } @@ -536,8 +536,8 @@ class AssetVisitor : public CRTP using stage_t = hlsl::ShaderStage; for (stage_t stage : {stage_t::ESS_VERTEX,stage_t::ESS_TESSELLATION_CONTROL,stage_t::ESS_TESSELLATION_EVALUATION,stage_t::ESS_GEOMETRY,stage_t::ESS_FRAGMENT}) { - const auto& specInfo = asset->getSpecInfo(stage); - const auto* shader = specInfo.shader; + const auto& specInfo = asset->getSpecInfos(stage); + const auto* shader = specInfo[0].shader.get(); if (!shader) { if 
(stage==stage_t::ESS_VERTEX) // required @@ -545,7 +545,7 @@ class AssetVisitor : public CRTP CRTP::template nullOptional(); continue; } - if (!descend(shader,{shader},specInfo)) + if (!descend(shader,{shader},specInfo[0], stage)) return false; } return true; @@ -1035,25 +1035,18 @@ class HashVisit : public CAssetConverter::CHashCache::hash_impl_base auto argTuple = std::tuple(extraArgs...); const auto& arg0 = std::get<0>(argTuple); // hash the spec info - if constexpr (std::is_same_v) + if constexpr (std::is_same_v) { + const auto stage = std::get<1>(argTuple); hasher << arg0.entryPoint; - hasher << arg0.stage; + hasher << stage; hasher << arg0.requiredSubgroupSize; - switch (arg0.stage) + if (!arg0.entries.empty()) { - case hlsl::ShaderStage::ESS_COMPUTE: - hasher << arg0.requireFullSubgroups; - break; - default: - break; - } - if (arg0.entries) - { - for (const auto& specConstant : *arg0.entries) + for (const auto& specConstant : arg0.entries) { hasher << specConstant.first; - hasher.update(specConstant.second.data, specConstant.second.size); + hasher.update(specConstant.second.data(), specConstant.second.size()); } } } @@ -1303,6 +1296,8 @@ bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_tgetCachedCreationParams(); + hasher << params.requireFullSubgroups; return true; } bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t lookup) @@ -1718,16 +1713,14 @@ class GetDependantVisit : public GetDependantVisitBase::value*/sizeof(IShader::E_SHADER_STAGE)*8> specInfo = {}; + ICPUPipelineBase::SShaderSpecInfo specInfo = {}; protected: bool descend_impl( @@ -1743,18 +1736,16 @@ class GetDependantVisit : public GetDependantVisitBase& user, const CAssetConverter::patch_t& userPatch, - const instance_t& dep, const CAssetConverter::patch_t& soloPatch, const IPipelineBase::SShaderSpecInfo& inSpecInfo + const instance_t& dep, const CAssetConverter::patch_t& soloPatch, const ICPUPipelineBase::SShaderSpecInfo& inSpecInfo, hlsl::ShaderStage stage ) { 
auto depObj = getDependant(dep,soloPatch); if (!depObj) return false; - getSpecInfo(inSpecInfo.stage) = { - .shader = depObj.get(), + getSpecInfo() = ICPUPipelineBase::SShaderSpecInfo{ + .shader = depObj, .entryPoint = inSpecInfo.entryPoint, // warning: its a `string_view` now! - .stage = inSpecInfo.stage, .requiredSubgroupSize = inSpecInfo.requiredSubgroupSize, - .requireFullSubgroups = inSpecInfo.requireFullSubgroups, .entries = inSpecInfo.entries }; return true; @@ -1775,7 +1766,7 @@ class GetDependantVisit : public GetDependantVisitBase::value*/sizeof(IShader::E_SHADER_STAGE)*8> specInfo = {}; + std::array::value*/sizeof(IShader::E_SHADER_STAGE)*8> specInfo = {}; // optionals (done this way because inheritance chain with templated class hides protected methods) IGPURenderpass* renderpass = nullptr; @@ -1793,18 +1784,16 @@ class GetDependantVisit : public GetDependantVisitBase& user, const CAssetConverter::patch_t& userPatch, - const instance_t& dep, const CAssetConverter::patch_t& soloPatch, const IPipelineBase::SShaderSpecInfo& inSpecInfo + const instance_t& dep, const CAssetConverter::patch_t& soloPatch, const ICPUPipelineBase::SShaderSpecInfo& inSpecInfo, hlsl::ShaderStage stage ) { auto depObj = getDependant(dep,soloPatch); if (!depObj) return false; - getSpecInfo(inSpecInfo.stage) = { - .shader = depObj.get(), + getSpecInfo(stage) = { + .shader = depObj, .entryPoint = inSpecInfo.entryPoint, // warning: its a `string_view` now! - .stage = inSpecInfo.stage, .requiredSubgroupSize = inSpecInfo.requiredSubgroupSize, - .requireFullSubgroups = 0, .entries = inSpecInfo.entries }; return true; @@ -3120,12 +3109,13 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult continue; // ILogicalDevice::createComputePipelines is rather aggressive on the spec constant validation, so we create one pipeline at a time core::smart_refctd_ptr ppln; + IGPUPipelineBase::SShaderEntryMap entryMap; { // no derivatives, special flags, etc. 
IGPUComputePipeline::SCreationParams params = {}; params.layout = visitor.layout; // while there are patches possible for shaders, the only patch which can happen here is changing a stage from UNKNOWN to COMPUTE - params.shader = visitor.getSpecInfo(IShader::E_SHADER_STAGE::ESS_COMPUTE); + params.shader = IGPUPipelineBase::SShaderSpecInfo::create(visitor.getSpecInfo(), entryMap); device->createComputePipelines(inputs.pipelineCache,{¶ms,1},&ppln); } assign(entry.first,entry.second.firstCopyIx,i,std::move(ppln)); @@ -3148,7 +3138,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } if constexpr (std::is_same_v) { - core::vector tmpSpecInfo; + core::vector tmpSpecInfo; tmpSpecInfo.reserve(5); for (auto& entry : conversionRequests) { @@ -3170,6 +3160,12 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { // no derivatives, special flags, etc. IGPUGraphicsPipeline::SCreationParams params = {}; + using SShaderEntryMap = IGPUPipelineBase::SShaderEntryMap; + SShaderEntryMap vertexEntryMap; + SShaderEntryMap tesselationControlEntryMap; + SShaderEntryMap tesselationEvaluationEntryMap; + SShaderEntryMap geometryEntryMap; + SShaderEntryMap fragmentEntryMap; bool depNotFound = false; { params.layout = visitor.layout; @@ -3183,7 +3179,12 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult if (info.shader) tmpSpecInfo.push_back(std::move(info)); } - params.shaders = tmpSpecInfo; + using GPUShaderSpecInfo = IGPUPipelineBase::SShaderSpecInfo; + params.vertexShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_VERTEX), vertexEntryMap); + params.tesselationControlShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_TESSELLATION_CONTROL), tesselationControlEntryMap); + params.tesselationEvaluationShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_TESSELLATION_EVALUATION), tesselationEvaluationEntryMap); + params.geometryShader = 
GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_GEOMETRY), geometryEntryMap); + params.fragmentShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_FRAGMENT), fragmentEntryMap); } params.cached = asset->getCachedCreationParams(); device->createGraphicsPipelines(inputs.pipelineCache,{¶ms,1},&ppln); From a31cc66ddf18268a14c82e8410cf72ff95e161b9 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 29 May 2025 20:45:33 +0700 Subject: [PATCH 250/346] Small fixes to asset and video --- include/nbl/asset/ICPUComputePipeline.h | 14 +++++++++++- include/nbl/asset/ICPUGraphicsPipeline.h | 4 ++-- include/nbl/asset/ICPUPipeline.h | 6 ++--- .../asset/ICPURenderpassIndependentPipeline.h | 4 ++-- include/nbl/asset/IComputePipeline.h | 1 - include/nbl/video/IGPUPipeline.h | 22 +++++++++++++++++-- 6 files changed, 40 insertions(+), 11 deletions(-) diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index b940c2ae48..69bffe2bba 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -39,13 +39,25 @@ class ICPUComputePipeline final : public ICPUPipeline getSpecInfo(hlsl::ShaderStage stage) const override + inline std::span getSpecInfos(hlsl::ShaderStage stage) const override { if (stage==hlsl::ShaderStage::ESS_COMPUTE) return {&m_specInfo,1}; return {}; } + inline SShaderSpecInfo& getSpecInfo() + { + return m_specInfo; + } + + inline const SShaderSpecInfo& getSpecInfo() const + { + return m_specInfo; + } + + inline SCachedCreationParams& getCachedCreationParamsMut() { return m_params; } + inline bool valid() const override { if (!m_layout) return false; diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index 4a1520880d..a17bebe87d 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -39,13 +39,13 @@ class ICPUGraphicsPipeline final : public ICPUPipeline getSpecInfo(hlsl::ShaderStage 
stage) const override final + inline virtual std::span getSpecInfos(hlsl::ShaderStage stage) const override final { const auto stageIndex = stageToIndex(stage); if (stageIndex != -1) diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index 069c9fc35e..0642acb676 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -93,7 +93,7 @@ class ICPUPipelineBase } }; - virtual std::span getSpecInfo(hlsl::ShaderStage stage) const = 0; + virtual std::span getSpecInfos(hlsl::ShaderStage stage) const = 0; }; @@ -131,11 +131,11 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase, public ICPUPipe return clone_impl(std::move(layout), _depth); } - // Note(kevinyu): For some reason overload resolution cannot find this function when I name id getSpecInfo. It always use the const variant. Will check on it later. + // Note(kevinyu): For some reason overload resolution cannot find this function when I name id getSpecInfos. It always use the const variant. Will check on it later. 
inline std::span getSpecInfoMut(hlsl::ShaderStage stage) { if (!isMutable()) return {}; - const auto specInfo = const_cast(this)->getSpecInfo(stage); + const auto specInfo = const_cast(this)->getSpecInfos(stage); return { const_cast(specInfo.data()), specInfo.size() }; } diff --git a/include/nbl/asset/ICPURenderpassIndependentPipeline.h b/include/nbl/asset/ICPURenderpassIndependentPipeline.h index fbff6ee312..83536e0c54 100644 --- a/include/nbl/asset/ICPURenderpassIndependentPipeline.h +++ b/include/nbl/asset/ICPURenderpassIndependentPipeline.h @@ -105,7 +105,7 @@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, #if 0 // The getters are weird because the shader pointer needs patching - inline IShader::SSpecInfo getSpecInfo(const hlsl::ShaderStage stage) + inline IShader::SSpecInfo getSpecInfos(const hlsl::ShaderStage stage) { assert(isMutable()); const auto stageIx = hlsl::findLSB(stage); @@ -113,7 +113,7 @@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, return {}; return m_infos[stageIx]; } - inline IShader::SSpecInfo getSpecInfo(const hlsl::ShaderStage stage) const + inline IShader::SSpecInfo getSpecInfos(const hlsl::ShaderStage stage) const { const auto stageIx = hlsl::findLSB(stage); if (stageIx<0 || stageIx>=GRAPHICS_SHADER_STAGE_COUNT || hlsl::bitCount(stage)!=1) diff --git a/include/nbl/asset/IComputePipeline.h b/include/nbl/asset/IComputePipeline.h index 2cb38b39f1..ba4d245473 100644 --- a/include/nbl/asset/IComputePipeline.h +++ b/include/nbl/asset/IComputePipeline.h @@ -24,7 +24,6 @@ class IComputePipeline : public IPipeline, public IComputePi public: inline const SCachedCreationParams& getCachedCreationParams() const { return m_params; } - inline SCachedCreationParams& getCachedCreationParams() { return m_params; } protected: explicit IComputePipeline(PipelineLayoutType* layout, const SCachedCreationParams& cachedParams) : diff --git a/include/nbl/video/IGPUPipeline.h 
b/include/nbl/video/IGPUPipeline.h index f2e9b79fef..0b56b87ee9 100644 --- a/include/nbl/video/IGPUPipeline.h +++ b/include/nbl/video/IGPUPipeline.h @@ -8,6 +8,7 @@ #include "nbl/video/IGPUPipelineLayout.h" #include "nbl/video/SPipelineCreationParams.h" +#include "nbl/asset/ICPUPipeline.h" #include "nbl/asset/IPipeline.h" namespace nbl::video @@ -17,6 +18,7 @@ class IGPUPipelineBase { public: struct SShaderSpecInfo { + //! Structure specifying a specialization map entry /* Note that if specialization constant ID is used @@ -93,18 +95,34 @@ class IGPUPipelineBase { asset::IPipelineBase::SUBGROUP_SIZE requiredSubgroupSize = asset::IPipelineBase::SUBGROUP_SIZE::UNKNOWN; //!< Default value of 8 means no requirement - // Container choice implicitly satisfies: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-constantID-04911 - const core::unordered_map* entries; + using entry_map_t = core::unordered_map; + const entry_map_t* entries; // By requiring Nabla Core Profile features we implicitly satisfy: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02784 // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02785 // Also because our API is sane, it satisfies the following by construction: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02754 + + static inline SShaderSpecInfo create(const asset::ICPUPipelineBase::SShaderSpecInfo& cpuSpecInfo, entry_map_t& outEntries) + { + SShaderSpecInfo specInfo; + specInfo.shader = cpuSpecInfo.shader.get(); + specInfo.entryPoint = cpuSpecInfo.entryPoint; + specInfo.requiredSubgroupSize = cpuSpecInfo.requiredSubgroupSize; + for (const auto&[key, value] : cpuSpecInfo.entries) + 
{ + outEntries.insert({ key, { value.data(), value.size() } }); + } + specInfo.entries = &outEntries; + return specInfo; + }; }; + using SShaderEntryMap = SShaderSpecInfo::entry_map_t; + }; // Common Base class for pipelines From 08ece5d33d07391a85be35905d508dc2359efb6e Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 29 May 2025 20:49:19 +0700 Subject: [PATCH 251/346] Fix CComputeBlit --- src/nbl/video/utilities/CComputeBlit.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nbl/video/utilities/CComputeBlit.cpp b/src/nbl/video/utilities/CComputeBlit.cpp index a402df2137..ade127b790 100644 --- a/src/nbl/video/utilities/CComputeBlit.cpp +++ b/src/nbl/video/utilities/CComputeBlit.cpp @@ -83,7 +83,7 @@ struct ConstevalParameters .entryPoint = "main", .requiredSubgroupSize = static_cast(findMSB(limits.maxSubgroupSize)), }; - pipeline->getCachedCreationParams() = { + pipeline->getCachedCreationParamsMut() = { .requireFullSubgroups = true, }; return pipeline; From 75530d4bc6e3297613a841f2d2b7929814a6d720 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Thu, 29 May 2025 19:58:08 +0200 Subject: [PATCH 252/346] Create run-nsc.yml workflow --- .github/workflows /run-nsc.yml | 206 +++++++++++++++++++++++++++++++++ 1 file changed, 206 insertions(+) create mode 100644 .github/workflows /run-nsc.yml diff --git a/.github/workflows /run-nsc.yml b/.github/workflows /run-nsc.yml new file mode 100644 index 0000000000..07be0d44e9 --- /dev/null +++ b/.github/workflows /run-nsc.yml @@ -0,0 +1,206 @@ +name: Run NSC Godbolt Container + +on: + workflow_dispatch: + inputs: + run_id: + description: "The id of the workflow run where the desired download artifact was uploaded from" + required: true + build_config: + description: "Build configuration (Release / RelWithDebInfo / Debug)" + required: true + default: "Release" + type: choice + options: + - Release + - RelWithDebInfo + - Debug + withDiscordMSG: + 
description: "Send Discord message after tunnel is up" + required: true + default: true + type: boolean + +jobs: + run-container: + runs-on: windows-2022 + env: + DISCORD_WEBHOOK: ${{ secrets.DC_ACTIONS_WEBHOOK }} + + steps: + - name: Environment Setup + run: | + Add-MpPreference -ExclusionPath "${{ github.workspace }}" + Add-MpPreference -ExclusionExtension "*.*" + Add-MpPreference -ExclusionProcess "docker.exe" + Add-MpPreference -ExclusionProcess "dockerd.exe" + Set-MpPreference -RemediationScheduleDay 8 + Set-MpPreference -DisableRealtimeMonitoring $true + Set-MpPreference -DisableRemovableDriveScanning $true + Set-MpPreference -DisableArchiveScanning $true + Set-MpPreference -DisableScanningMappedNetworkDrivesForFullScan $true + + if (-not (docker network ls --format '{{.Name}}' | Where-Object { $_ -eq 'docker_default' })) { + docker network create --driver nat docker_default + if ($LASTEXITCODE -ne 0) { exit 1 } + } + + - name: Download NSC Godbolt artifact + uses: actions/download-artifact@v4 + with: + run-id: ${{ inputs.run_id }} + pattern: run-windows-*-msvc-${{ inputs.build_config }}-nsc-godbolt-image + path: artifact + merge-multiple: true + github-token: ${{ secrets.READ_PAT }} + repository: Devsh-Graphics-Programming/Nabla + + - name: Decompress .tar.zst + run: | + Get-ChildItem artifact -Filter *.tar.zst | ForEach-Object { + $output = $_.FullName -replace '\.zst$', '' + zstd -d "$($_.FullName)" -o "$output" + } + + - name: Load Docker image + run: | + $image = Get-ChildItem artifact -Filter *.tar | Select-Object -First 1 + docker load -i $image.FullName + + - name: Generate and run Docker Compose with matched image + run: | + $imageName = docker image ls --format "{{.Repository}}:{{.Tag}}" | + Where-Object { $_ -like "ghcr.io/devsh-graphics-programming/nabla:nsc-*" } | + Select-Object -First 1 + + if (-not $imageName) { + Write-Error "Could not find image with tag matching ghcr.io/devsh-graphics-programming/nabla:nsc-*" + exit 1 + } + + Write-Host 
"Found image: $imageName" + + @" + services: + nsc: + container_name: nsc-godbolt + image: $imageName + isolation: process + ports: + - "10240:10240" + volumes: + - type: bind + source: C:\Windows\Globalization\ICU + target: C:\Windows\Globalization\ICU + read_only: true + - type: bind + source: C:\Windows\System32 + target: C:\mount\Windows\System32 + read_only: true + networks: + - docker_default + + networks: + docker_default: + external: true + "@ | Set-Content compose.generated.yml + + docker compose -f compose.generated.yml up -d + + - name: Wait for local server on port 10240 + run: | + $maxRetries = 24 + $retryDelay = 5 + $success = $false + + for ($i = 0; $i -lt $maxRetries; $i++) { + try { + $response = Invoke-WebRequest -Uri "http://localhost:10240" -UseBasicParsing -TimeoutSec 5 + if ($response.StatusCode -eq 200) { + Write-Host "Local server is up and responding." + $success = $true + break + } else { + Write-Host "Received HTTP $($response.StatusCode), retrying..." + } + } catch { + Write-Host "Local server not responding yet, retrying..." + } + Start-Sleep -Seconds $retryDelay + } + + if (-not $success) { + Write-Error "Local server on port 10240 did not respond within timeout." 
+ exit 1 + } + + - name: Print Container Logs + run: | + docker logs nsc-godbolt + + - name: Download cloudflared + run: | + Invoke-WebRequest -Uri https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-windows-amd64.exe -OutFile cloudflared.exe + + - name: Start tunnel + run: | + Start-Process -NoNewWindow -FilePath .\cloudflared.exe -ArgumentList "tunnel", "--url", "http://localhost:10240", "--logfile", "cf.log" + + $tries = 60 + $url = $null + + while ($tries -gt 0) { + if (Test-Path cf.log) { + $log = Get-Content cf.log + foreach ($line in $log) { + if ($line -match 'https:\/\/[a-zA-Z0-9\-]+\.trycloudflare\.com') { + $url = $Matches[0] + Write-Host "::notice title=Tunnel URL::$url" + break + } + } + if ($url) { break } + } + Start-Sleep -Seconds 1 + $tries -= 1 + } + + if (-not $url) { + Write-Error "Could not get tunnel URL from cloudflared log" + exit 1 + } + + $webhookUrl = "$env:DISCORD_WEBHOOK" + $runId = "${{ inputs.run_id }}" + $actor = "$env:GITHUB_ACTOR" + $startTime = (Get-Date -Format "yyyy-MM-dd HH:mm:ss") + $composedURL = "https://github.com/Devsh-Graphics-Programming/Nabla/actions/runs/$runId" + $workflowRunURL = "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + $sendDiscord = "${{ inputs.withDiscordMSG }}" -eq "true" + + $description = @" + - tunnel opened for 5 hours, click [here](<$url>) to connect + - workflow [logs #${{ github.run_id }}](<$workflowRunURL>) + - image downloaded from [run #$runId](<$composedURL>) + - dispatched by $actor + "@ + + $payload = @{ + embeds = @( + @{ + title = "Running NSC Godbolt Container" + description = $description + color = 15844367 + footer = @{ + text = "sent from GitHub Actions runner" + } + timestamp = (Get-Date).ToString("o") + } + ) + } | ConvertTo-Json -Depth 10 + + if ($sendDiscord) { + Invoke-RestMethod -Uri $webhookUrl -Method Post -ContentType 'application/json' -Body $payload + } + + Start-Sleep -Seconds 18000 From 
4ec5bac4273992780aec843975b90bd77c1b8a5f Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Thu, 29 May 2025 20:04:44 +0200 Subject: [PATCH 253/346] Delete .github/workflows directory --- .github/workflows /run-nsc.yml | 206 --------------------------------- 1 file changed, 206 deletions(-) delete mode 100644 .github/workflows /run-nsc.yml diff --git a/.github/workflows /run-nsc.yml b/.github/workflows /run-nsc.yml deleted file mode 100644 index 07be0d44e9..0000000000 --- a/.github/workflows /run-nsc.yml +++ /dev/null @@ -1,206 +0,0 @@ -name: Run NSC Godbolt Container - -on: - workflow_dispatch: - inputs: - run_id: - description: "The id of the workflow run where the desired download artifact was uploaded from" - required: true - build_config: - description: "Build configuration (Release / RelWithDebInfo / Debug)" - required: true - default: "Release" - type: choice - options: - - Release - - RelWithDebInfo - - Debug - withDiscordMSG: - description: "Send Discord message after tunnel is up" - required: true - default: true - type: boolean - -jobs: - run-container: - runs-on: windows-2022 - env: - DISCORD_WEBHOOK: ${{ secrets.DC_ACTIONS_WEBHOOK }} - - steps: - - name: Environment Setup - run: | - Add-MpPreference -ExclusionPath "${{ github.workspace }}" - Add-MpPreference -ExclusionExtension "*.*" - Add-MpPreference -ExclusionProcess "docker.exe" - Add-MpPreference -ExclusionProcess "dockerd.exe" - Set-MpPreference -RemediationScheduleDay 8 - Set-MpPreference -DisableRealtimeMonitoring $true - Set-MpPreference -DisableRemovableDriveScanning $true - Set-MpPreference -DisableArchiveScanning $true - Set-MpPreference -DisableScanningMappedNetworkDrivesForFullScan $true - - if (-not (docker network ls --format '{{.Name}}' | Where-Object { $_ -eq 'docker_default' })) { - docker network create --driver nat docker_default - if ($LASTEXITCODE -ne 0) { exit 1 } - } - - - name: Download NSC Godbolt artifact - uses: 
actions/download-artifact@v4 - with: - run-id: ${{ inputs.run_id }} - pattern: run-windows-*-msvc-${{ inputs.build_config }}-nsc-godbolt-image - path: artifact - merge-multiple: true - github-token: ${{ secrets.READ_PAT }} - repository: Devsh-Graphics-Programming/Nabla - - - name: Decompress .tar.zst - run: | - Get-ChildItem artifact -Filter *.tar.zst | ForEach-Object { - $output = $_.FullName -replace '\.zst$', '' - zstd -d "$($_.FullName)" -o "$output" - } - - - name: Load Docker image - run: | - $image = Get-ChildItem artifact -Filter *.tar | Select-Object -First 1 - docker load -i $image.FullName - - - name: Generate and run Docker Compose with matched image - run: | - $imageName = docker image ls --format "{{.Repository}}:{{.Tag}}" | - Where-Object { $_ -like "ghcr.io/devsh-graphics-programming/nabla:nsc-*" } | - Select-Object -First 1 - - if (-not $imageName) { - Write-Error "Could not find image with tag matching ghcr.io/devsh-graphics-programming/nabla:nsc-*" - exit 1 - } - - Write-Host "Found image: $imageName" - - @" - services: - nsc: - container_name: nsc-godbolt - image: $imageName - isolation: process - ports: - - "10240:10240" - volumes: - - type: bind - source: C:\Windows\Globalization\ICU - target: C:\Windows\Globalization\ICU - read_only: true - - type: bind - source: C:\Windows\System32 - target: C:\mount\Windows\System32 - read_only: true - networks: - - docker_default - - networks: - docker_default: - external: true - "@ | Set-Content compose.generated.yml - - docker compose -f compose.generated.yml up -d - - - name: Wait for local server on port 10240 - run: | - $maxRetries = 24 - $retryDelay = 5 - $success = $false - - for ($i = 0; $i -lt $maxRetries; $i++) { - try { - $response = Invoke-WebRequest -Uri "http://localhost:10240" -UseBasicParsing -TimeoutSec 5 - if ($response.StatusCode -eq 200) { - Write-Host "Local server is up and responding." 
- $success = $true - break - } else { - Write-Host "Received HTTP $($response.StatusCode), retrying..." - } - } catch { - Write-Host "Local server not responding yet, retrying..." - } - Start-Sleep -Seconds $retryDelay - } - - if (-not $success) { - Write-Error "Local server on port 10240 did not respond within timeout." - exit 1 - } - - - name: Print Container Logs - run: | - docker logs nsc-godbolt - - - name: Download cloudflared - run: | - Invoke-WebRequest -Uri https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-windows-amd64.exe -OutFile cloudflared.exe - - - name: Start tunnel - run: | - Start-Process -NoNewWindow -FilePath .\cloudflared.exe -ArgumentList "tunnel", "--url", "http://localhost:10240", "--logfile", "cf.log" - - $tries = 60 - $url = $null - - while ($tries -gt 0) { - if (Test-Path cf.log) { - $log = Get-Content cf.log - foreach ($line in $log) { - if ($line -match 'https:\/\/[a-zA-Z0-9\-]+\.trycloudflare\.com') { - $url = $Matches[0] - Write-Host "::notice title=Tunnel URL::$url" - break - } - } - if ($url) { break } - } - Start-Sleep -Seconds 1 - $tries -= 1 - } - - if (-not $url) { - Write-Error "Could not get tunnel URL from cloudflared log" - exit 1 - } - - $webhookUrl = "$env:DISCORD_WEBHOOK" - $runId = "${{ inputs.run_id }}" - $actor = "$env:GITHUB_ACTOR" - $startTime = (Get-Date -Format "yyyy-MM-dd HH:mm:ss") - $composedURL = "https://github.com/Devsh-Graphics-Programming/Nabla/actions/runs/$runId" - $workflowRunURL = "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" - $sendDiscord = "${{ inputs.withDiscordMSG }}" -eq "true" - - $description = @" - - tunnel opened for 5 hours, click [here](<$url>) to connect - - workflow [logs #${{ github.run_id }}](<$workflowRunURL>) - - image downloaded from [run #$runId](<$composedURL>) - - dispatched by $actor - "@ - - $payload = @{ - embeds = @( - @{ - title = "Running NSC Godbolt Container" - description = $description - color = 15844367 - 
footer = @{ - text = "sent from GitHub Actions runner" - } - timestamp = (Get-Date).ToString("o") - } - ) - } | ConvertTo-Json -Depth 10 - - if ($sendDiscord) { - Invoke-RestMethod -Uri $webhookUrl -Method Post -ContentType 'application/json' -Body $payload - } - - Start-Sleep -Seconds 18000 From 104422f7752b4b49d9e2f938f64d52c1528337a9 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Thu, 29 May 2025 20:05:10 +0200 Subject: [PATCH 254/346] Create run-nsc.yml --- .github/workflows/run-nsc.yml | 206 ++++++++++++++++++++++++++++++++++ 1 file changed, 206 insertions(+) create mode 100644 .github/workflows/run-nsc.yml diff --git a/.github/workflows/run-nsc.yml b/.github/workflows/run-nsc.yml new file mode 100644 index 0000000000..07be0d44e9 --- /dev/null +++ b/.github/workflows/run-nsc.yml @@ -0,0 +1,206 @@ +name: Run NSC Godbolt Container + +on: + workflow_dispatch: + inputs: + run_id: + description: "The id of the workflow run where the desired download artifact was uploaded from" + required: true + build_config: + description: "Build configuration (Release / RelWithDebInfo / Debug)" + required: true + default: "Release" + type: choice + options: + - Release + - RelWithDebInfo + - Debug + withDiscordMSG: + description: "Send Discord message after tunnel is up" + required: true + default: true + type: boolean + +jobs: + run-container: + runs-on: windows-2022 + env: + DISCORD_WEBHOOK: ${{ secrets.DC_ACTIONS_WEBHOOK }} + + steps: + - name: Environment Setup + run: | + Add-MpPreference -ExclusionPath "${{ github.workspace }}" + Add-MpPreference -ExclusionExtension "*.*" + Add-MpPreference -ExclusionProcess "docker.exe" + Add-MpPreference -ExclusionProcess "dockerd.exe" + Set-MpPreference -RemediationScheduleDay 8 + Set-MpPreference -DisableRealtimeMonitoring $true + Set-MpPreference -DisableRemovableDriveScanning $true + Set-MpPreference -DisableArchiveScanning $true + Set-MpPreference 
-DisableScanningMappedNetworkDrivesForFullScan $true + + if (-not (docker network ls --format '{{.Name}}' | Where-Object { $_ -eq 'docker_default' })) { + docker network create --driver nat docker_default + if ($LASTEXITCODE -ne 0) { exit 1 } + } + + - name: Download NSC Godbolt artifact + uses: actions/download-artifact@v4 + with: + run-id: ${{ inputs.run_id }} + pattern: run-windows-*-msvc-${{ inputs.build_config }}-nsc-godbolt-image + path: artifact + merge-multiple: true + github-token: ${{ secrets.READ_PAT }} + repository: Devsh-Graphics-Programming/Nabla + + - name: Decompress .tar.zst + run: | + Get-ChildItem artifact -Filter *.tar.zst | ForEach-Object { + $output = $_.FullName -replace '\.zst$', '' + zstd -d "$($_.FullName)" -o "$output" + } + + - name: Load Docker image + run: | + $image = Get-ChildItem artifact -Filter *.tar | Select-Object -First 1 + docker load -i $image.FullName + + - name: Generate and run Docker Compose with matched image + run: | + $imageName = docker image ls --format "{{.Repository}}:{{.Tag}}" | + Where-Object { $_ -like "ghcr.io/devsh-graphics-programming/nabla:nsc-*" } | + Select-Object -First 1 + + if (-not $imageName) { + Write-Error "Could not find image with tag matching ghcr.io/devsh-graphics-programming/nabla:nsc-*" + exit 1 + } + + Write-Host "Found image: $imageName" + + @" + services: + nsc: + container_name: nsc-godbolt + image: $imageName + isolation: process + ports: + - "10240:10240" + volumes: + - type: bind + source: C:\Windows\Globalization\ICU + target: C:\Windows\Globalization\ICU + read_only: true + - type: bind + source: C:\Windows\System32 + target: C:\mount\Windows\System32 + read_only: true + networks: + - docker_default + + networks: + docker_default: + external: true + "@ | Set-Content compose.generated.yml + + docker compose -f compose.generated.yml up -d + + - name: Wait for local server on port 10240 + run: | + $maxRetries = 24 + $retryDelay = 5 + $success = $false + + for ($i = 0; $i -lt $maxRetries; 
$i++) { + try { + $response = Invoke-WebRequest -Uri "http://localhost:10240" -UseBasicParsing -TimeoutSec 5 + if ($response.StatusCode -eq 200) { + Write-Host "Local server is up and responding." + $success = $true + break + } else { + Write-Host "Received HTTP $($response.StatusCode), retrying..." + } + } catch { + Write-Host "Local server not responding yet, retrying..." + } + Start-Sleep -Seconds $retryDelay + } + + if (-not $success) { + Write-Error "Local server on port 10240 did not respond within timeout." + exit 1 + } + + - name: Print Container Logs + run: | + docker logs nsc-godbolt + + - name: Download cloudflared + run: | + Invoke-WebRequest -Uri https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-windows-amd64.exe -OutFile cloudflared.exe + + - name: Start tunnel + run: | + Start-Process -NoNewWindow -FilePath .\cloudflared.exe -ArgumentList "tunnel", "--url", "http://localhost:10240", "--logfile", "cf.log" + + $tries = 60 + $url = $null + + while ($tries -gt 0) { + if (Test-Path cf.log) { + $log = Get-Content cf.log + foreach ($line in $log) { + if ($line -match 'https:\/\/[a-zA-Z0-9\-]+\.trycloudflare\.com') { + $url = $Matches[0] + Write-Host "::notice title=Tunnel URL::$url" + break + } + } + if ($url) { break } + } + Start-Sleep -Seconds 1 + $tries -= 1 + } + + if (-not $url) { + Write-Error "Could not get tunnel URL from cloudflared log" + exit 1 + } + + $webhookUrl = "$env:DISCORD_WEBHOOK" + $runId = "${{ inputs.run_id }}" + $actor = "$env:GITHUB_ACTOR" + $startTime = (Get-Date -Format "yyyy-MM-dd HH:mm:ss") + $composedURL = "https://github.com/Devsh-Graphics-Programming/Nabla/actions/runs/$runId" + $workflowRunURL = "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + $sendDiscord = "${{ inputs.withDiscordMSG }}" -eq "true" + + $description = @" + - tunnel opened for 5 hours, click [here](<$url>) to connect + - workflow [logs #${{ github.run_id }}](<$workflowRunURL>) + - image downloaded 
from [run #$runId](<$composedURL>) + - dispatched by $actor + "@ + + $payload = @{ + embeds = @( + @{ + title = "Running NSC Godbolt Container" + description = $description + color = 15844367 + footer = @{ + text = "sent from GitHub Actions runner" + } + timestamp = (Get-Date).ToString("o") + } + ) + } | ConvertTo-Json -Depth 10 + + if ($sendDiscord) { + Invoke-RestMethod -Uri $webhookUrl -Method Post -ContentType 'application/json' -Body $payload + } + + Start-Sleep -Seconds 18000 From 90d3579660fbe8f914e1009cc778490bbe5c456a Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 30 May 2025 11:10:54 +0700 Subject: [PATCH 255/346] fix scans for level 1+ --- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 1d386835b9..e4c23ee555 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -179,15 +179,15 @@ struct scan const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan - subgroup2::inclusive_scan inclusiveScan1; + subgroup2::exclusive_scan exclusiveScan1; if (glsl::gl_SubgroupID() == 0) { vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i)-1,lv1_val[i]); - lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); - lv1_val = inclusiveScan1(lv1_val); + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); + // lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); + lv1_val = exclusiveScan1(lv1_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); @@ 
-304,15 +304,16 @@ struct scan const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan const uint32_t lv1_smem_size = Config::SubgroupsSize*Config::ItemsPerInvocation_1; - subgroup2::inclusive_scan inclusiveScan1; - if (glsl::gl_SubgroupID() < Config::SubgroupsSize*Config::ItemsPerInvocation_2) + const uint32_t lv1_num_invoc = Config::SubgroupsSize*Config::ItemsPerInvocation_2; + subgroup2::exclusive_scan exclusiveScan1; + if (glsl::gl_SubgroupID() < lv1_num_invoc) { vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i)-1,lv1_val[i]); - lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); - lv1_val = inclusiveScan1(lv1_val); + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); + // lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); + lv1_val = exclusiveScan1(lv1_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); @@ -325,15 +326,15 @@ struct scan scratchAccessor.workgroupExecutionAndMemoryBarrier(); // level 2 scan - subgroup2::inclusive_scan inclusiveScan2; + subgroup2::exclusive_scan exclusiveScan2; if (glsl::gl_SubgroupID() == 0) { vector_lv2_t lv2_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i)-1,lv2_val[i]); + scratchAccessor.template get(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); lv2_val[0] = hlsl::mix(BinOp::identity, lv2_val[0], bool(invocationIndex)); - lv2_val = inclusiveScan2(lv2_val); + lv2_val = exclusiveScan2(lv2_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) scratchAccessor.template 
set(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); @@ -341,7 +342,7 @@ struct scan scratchAccessor.workgroupExecutionAndMemoryBarrier(); // combine with level 1 - if (glsl::gl_SubgroupID() < lv1_smem_size) + if (glsl::gl_SubgroupID() < lv1_num_invoc) { vector_lv1_t lv1_val; [unroll] From 203c03a8f52b4cec36f88d6566fdff6d67534b53 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 30 May 2025 14:17:42 +0700 Subject: [PATCH 256/346] some indexing fixes for 3-level reduce/scan --- .../builtin/hlsl/workgroup2/arithmetic_config.hlsl | 13 +++++++------ .../nbl/builtin/hlsl/workgroup2/shared_scan.hlsl | 6 +++--- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 2f1a8b06a0..c7832c360a 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -51,9 +51,6 @@ struct ArithmeticConfiguration NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualWorkgroupSize = uint16_t(0x1u) << virtual_wg_t::value; static_assert(VirtualWorkgroupSize<=WorkgroupSize*SubgroupSize); - NBL_CONSTEXPR_STATIC_INLINE uint16_t __SubgroupsPerVirtualWorkgroupLog2 = mpl::max_v; - NBL_CONSTEXPR_STATIC_INLINE uint16_t __SubgroupsPerVirtualWorkgroup = uint16_t(0x1u) << __SubgroupsPerVirtualWorkgroupLog2; - using items_per_invoc_t = impl::items_per_invocation; // NBL_CONSTEXPR_STATIC_INLINE uint32_t2 ItemsPerInvocation; TODO? 
doesn't allow inline definitions for uint32_t2 for some reason, uint32_t[2] as well ; declaring out of line results in not constant expression NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_0 = items_per_invoc_t::value0; @@ -61,12 +58,16 @@ struct ArithmeticConfiguration NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = items_per_invoc_t::value2; static_assert(ItemsPerInvocation_1<=4, "3 level scan would have been needed with this config!"); + NBL_CONSTEXPR_STATIC_INLINE uint16_t __ItemsPerVirtualWorkgroupLog2 = mpl::max_v; + NBL_CONSTEXPR_STATIC_INLINE uint16_t __ItemsPerVirtualWorkgroup = uint16_t(0x1u) << __ItemsPerVirtualWorkgroupLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t __SubgroupsPerVirtualWorkgroup = __ItemsPerVirtualWorkgroup / ItemsPerInvocation_1; + NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = conditional_value::value + SubgroupSize*ItemsPerInvocation_1 + SubgroupSize*ItemsPerInvocation_2+__ItemsPerVirtualWorkgroup, + SubgroupSize*ItemsPerInvocation_1 + >::value >::value; static bool electLast() diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index e4c23ee555..af37908292 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -245,7 +245,7 @@ struct reduce const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan - const uint32_t lv1_smem_size = Config::SubgroupsSize*Config::ItemsPerInvocation_1; + const uint32_t lv1_smem_size = Config::__ItemsPerVirtualWorkgroup; subgroup2::reduction reduction1; if (glsl::gl_SubgroupID() < Config::SubgroupSize*Config::ItemsPerInvocation_2) { @@ -303,8 +303,8 @@ struct scan const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan - const uint32_t lv1_smem_size = Config::SubgroupsSize*Config::ItemsPerInvocation_1; - const uint32_t lv1_num_invoc = 
Config::SubgroupsSize*Config::ItemsPerInvocation_2; + const uint32_t lv1_smem_size = Config::__ItemsPerVirtualWorkgroup; + const uint32_t lv1_num_invoc = Config::SubgroupSize*Config::ItemsPerInvocation_2; subgroup2::exclusive_scan exclusiveScan1; if (glsl::gl_SubgroupID() < lv1_num_invoc) { From 0b163078f8363129a3b34a293f0f1286d2e82791 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 30 May 2025 15:57:21 +0700 Subject: [PATCH 257/346] fix 3-level scan downsweep step --- examples_tests | 2 +- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 18 +++++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/examples_tests b/examples_tests index f202ef5632..93b78108b4 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit f202ef563249c172d4a6c699379c6793ae939863 +Subproject commit 93b78108b433cfb85407c5f6816adc4c58b0fb7b diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index af37908292..de55a131b8 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -305,7 +305,7 @@ struct scan // level 1 scan const uint32_t lv1_smem_size = Config::__ItemsPerVirtualWorkgroup; const uint32_t lv1_num_invoc = Config::SubgroupSize*Config::ItemsPerInvocation_2; - subgroup2::exclusive_scan exclusiveScan1; + subgroup2::inclusive_scan inclusiveScan1; if (glsl::gl_SubgroupID() < lv1_num_invoc) { vector_lv1_t lv1_val; @@ -313,7 +313,7 @@ struct scan for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); // lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); - lv1_val = exclusiveScan1(lv1_val); + lv1_val = inclusiveScan1(lv1_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, 
i),lv1_val[i]); @@ -333,7 +333,7 @@ struct scan [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) scratchAccessor.template get(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); - lv2_val[0] = hlsl::mix(BinOp::identity, lv2_val[0], bool(invocationIndex)); + // lv2_val[0] = hlsl::mix(BinOp::identity, lv2_val[0], bool(invocationIndex)); lv2_val = exclusiveScan2(lv2_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) @@ -347,16 +347,20 @@ struct scan vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(i*Config::SubgroupSize+invocationIndex,lv1_val[i]); + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i), lv1_val[i]); + + const scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(lv1_val[Config::ItemsPerInvocation_1-1],1), bool(glsl::gl_SubgroupInvocationID())); scalar_t lv2_scan; const uint32_t bankedIndex = Config::template sharedStoreIndex<2>(glsl::gl_SubgroupID()); - scratchAccessor.template set(lv1_smem_size+bankedIndex, lv2_scan); + scratchAccessor.template get(lv1_smem_size+bankedIndex, lv2_scan); [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i), binop(lv1_val[i],lv2_scan)); + for (uint32_t i = Config::ItemsPerInvocation_1-1; i > 0; i--) + scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i), binop(lv1_val[i-1],lv2_scan)); + scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, 0), binop(left_last_elem,lv2_scan)); } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); // combine with level 0 [unroll] From aab868be8dca650a1a037a016382691831def6b6 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Fri, 30 May 2025 18:49:34 +0200 Subject: [PATCH 
258/346] Update run-nsc.yml, add reverse proxy, require authentication to connect to tested NSC Godbolt instance, add options to set timeout and default to 1h to not violate Github ToS we do not allow for public connections and restrict to members of DevshGraphicsProgramming only --- .github/workflows/run-nsc.yml | 105 ++++++++++++++++++++++++++-------- 1 file changed, 81 insertions(+), 24 deletions(-) diff --git a/.github/workflows/run-nsc.yml b/.github/workflows/run-nsc.yml index 07be0d44e9..c886256a83 100644 --- a/.github/workflows/run-nsc.yml +++ b/.github/workflows/run-nsc.yml @@ -15,6 +15,17 @@ on: - Release - RelWithDebInfo - Debug + tunnelDurationHours: + description: "Hours amount the restricted tunnel should stay up" + required: true + default: "1" + type: choice + options: + - "1" + - "2" + - "3" + - "4" + - "5" withDiscordMSG: description: "Send Discord message after tunnel is up" required: true @@ -44,7 +55,47 @@ jobs: docker network create --driver nat docker_default if ($LASTEXITCODE -ne 0) { exit 1 } } - + + $sendDiscord = "${{ inputs.withDiscordMSG }}" -eq "true" + Write-Host "::notice::Should send discord message? 
$sendDiscord" + + - name: Download Restricted Reverse Proxy binaries, setup NGINX config + run: | + Invoke-WebRequest -Uri https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-windows-amd64.exe -OutFile cloudflared.exe + Invoke-WebRequest -Uri "https://nginx.org/download/nginx-1.24.0.zip" -OutFile nginx.zip + Expand-Archive nginx.zip -DestinationPath nginx + + Remove-Item -Recurse -Force "nginx/nginx-1.24.0/conf" + New-Item -ItemType Directory -Path "nginx/nginx-1.24.0/conf" -Force | Out-Null + + '${{ secrets.NSC_BASIC_AUTH_HTPASSWD }}' | Out-File nginx/nginx-1.24.0/conf/.htpasswd -Encoding ascii + $htpasswdPath = (Resolve-Path "nginx/nginx-1.24.0/conf/.htpasswd").Path -replace '\\', '/' + + @" + events {} + + http { + server { + listen 10241; + + location / { + auth_basic "Restricted Compiler Explorer access for Development & NSC Artifact Tests, downloaded from Nabla actions pipeline"; + auth_basic_user_file "$htpasswdPath"; + + proxy_pass http://127.0.0.1:10240; + proxy_set_header Host `$host; + proxy_set_header X-Real-IP `$remote_addr; + } + } + } + "@ | Out-File nginx/nginx-1.24.0/conf/nginx.conf -Encoding ascii + + Write-Host "::group::Generated nginx.conf" + Get-Content nginx/nginx-1.24.0/conf/nginx.conf + Write-Host "::endgroup::" + + & "nginx/nginx-1.24.0/nginx.exe" -t -p "nginx/nginx-1.24.0" -c "conf/nginx.conf" + - name: Download NSC Godbolt artifact uses: actions/download-artifact@v4 with: @@ -107,7 +158,7 @@ jobs: docker compose -f compose.generated.yml up -d - - name: Wait for local server on port 10240 + - name: Wait for NSC container response on port run: | $maxRetries = 24 $retryDelay = 5 @@ -117,34 +168,35 @@ jobs: try { $response = Invoke-WebRequest -Uri "http://localhost:10240" -UseBasicParsing -TimeoutSec 5 if ($response.StatusCode -eq 200) { - Write-Host "Local server is up and responding." + Write-Host "NSC container is up listening on port 10240 and responding." 
$success = $true break } else { Write-Host "Received HTTP $($response.StatusCode), retrying..." } } catch { - Write-Host "Local server not responding yet, retrying..." + Write-Host "NSC container is not responding on port 10240, retrying..." } Start-Sleep -Seconds $retryDelay } if (-not $success) { - Write-Error "Local server on port 10240 did not respond within timeout." + Write-Error "No response from NSC container on port 10240, timeout." exit 1 } - - name: Print Container Logs + - name: Print NSC container logs run: | docker logs nsc-godbolt - - name: Download cloudflared + - name: Start Restricted Tunnel + env: + DISCORD_ENABLED: ${{ inputs.withDiscordMSG }} + TUNNEL_DURATION_HOURS: ${{ inputs.tunnelDurationHours }} run: | - Invoke-WebRequest -Uri https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-windows-amd64.exe -OutFile cloudflared.exe - - - name: Start tunnel - run: | - Start-Process -NoNewWindow -FilePath .\cloudflared.exe -ArgumentList "tunnel", "--url", "http://localhost:10240", "--logfile", "cf.log" + Start-Process -NoNewWindow -FilePath .\nginx\nginx-1.24.0\nginx.exe -ArgumentList '-p', (Join-Path $PWD 'nginx/nginx-1.24.0'), '-c', 'conf/nginx.conf' + Start-Process -NoNewWindow -FilePath .\cloudflared.exe -ArgumentList "tunnel", "--url", "http://localhost:10241", "--logfile", "cf.log" + netstat -an | findstr 10241 $tries = 60 $url = $null @@ -164,23 +216,27 @@ jobs: Start-Sleep -Seconds 1 $tries -= 1 } - + if (-not $url) { Write-Error "Could not get tunnel URL from cloudflared log" exit 1 } $webhookUrl = "$env:DISCORD_WEBHOOK" - $runId = "${{ inputs.run_id }}" + $runId = "$env:GITHUB_RUN_ID" $actor = "$env:GITHUB_ACTOR" - $startTime = (Get-Date -Format "yyyy-MM-dd HH:mm:ss") $composedURL = "https://github.com/Devsh-Graphics-Programming/Nabla/actions/runs/$runId" - $workflowRunURL = "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" - $sendDiscord = "${{ inputs.withDiscordMSG }}" -eq "true" + 
$workflowRunURL = "https://github.com/$env:GITHUB_REPOSITORY/actions/runs/$runId" + $sendDiscord = "$env:DISCORD_ENABLED" -eq "true" + $hours = [int]$env:TUNNEL_DURATION_HOURS + $duration = $hours * 3600 + + Write-Host "Blocking job for $hours hours" $description = @" - - tunnel opened for 5 hours, click [here](<$url>) to connect - - workflow [logs #${{ github.run_id }}](<$workflowRunURL>) + - tunnel opened for $hours hours, click [here](<$url>) to connect + - requires authentication + - workflow [logs #$runId](<$workflowRunURL>) - image downloaded from [run #$runId](<$composedURL>) - dispatched by $actor "@ @@ -191,16 +247,17 @@ jobs: title = "Running NSC Godbolt Container" description = $description color = 15844367 - footer = @{ - text = "sent from GitHub Actions runner" - } + footer = @{ text = "sent from GitHub Actions runner" } timestamp = (Get-Date).ToString("o") } ) } | ConvertTo-Json -Depth 10 - + if ($sendDiscord) { + Write-Host "Sending Discord webhook..." Invoke-RestMethod -Uri $webhookUrl -Method Post -ContentType 'application/json' -Body $payload + } else { + Write-Host "Discord webhook disabled" } - Start-Sleep -Seconds 18000 + Start-Sleep -Seconds $duration From 068fc26b724832047ec3038a33fddd5c39fbc1fe Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Fri, 30 May 2025 19:35:02 +0200 Subject: [PATCH 259/346] Update run-nsc.yml, typo I did --- .github/workflows/run-nsc.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-nsc.yml b/.github/workflows/run-nsc.yml index c886256a83..e52bbae0fd 100644 --- a/.github/workflows/run-nsc.yml +++ b/.github/workflows/run-nsc.yml @@ -225,8 +225,8 @@ jobs: $webhookUrl = "$env:DISCORD_WEBHOOK" $runId = "$env:GITHUB_RUN_ID" $actor = "$env:GITHUB_ACTOR" + $workflowRunURL = "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" $composedURL = 
"https://github.com/Devsh-Graphics-Programming/Nabla/actions/runs/$runId" - $workflowRunURL = "https://github.com/$env:GITHUB_REPOSITORY/actions/runs/$runId" $sendDiscord = "$env:DISCORD_ENABLED" -eq "true" $hours = [int]$env:TUNNEL_DURATION_HOURS $duration = $hours * 3600 From 5bf733671bead2de6216f048005d89c038ea3376 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Fri, 30 May 2025 19:52:12 +0200 Subject: [PATCH 260/346] Update run-nsc.yml, some updates to dc logs --- .github/workflows/run-nsc.yml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/run-nsc.yml b/.github/workflows/run-nsc.yml index e52bbae0fd..456e0d3054 100644 --- a/.github/workflows/run-nsc.yml +++ b/.github/workflows/run-nsc.yml @@ -223,10 +223,11 @@ jobs: } $webhookUrl = "$env:DISCORD_WEBHOOK" - $runId = "$env:GITHUB_RUN_ID" + $thisWorkflowRunID = "${{ github.run_id }}" + $artifactWorkflowRunID = "${{ inputs.run_id }}" + $thisWorkflowRunURL = "https://github.com/${{ github.repository }}/actions/runs/$thisWorkflowRunID" + $artifactWorkflowRunURL = "https://github.com/${{ github.repository }}/actions/runs/$artifactWorkflowRunID" $actor = "$env:GITHUB_ACTOR" - $workflowRunURL = "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" - $composedURL = "https://github.com/Devsh-Graphics-Programming/Nabla/actions/runs/$runId" $sendDiscord = "$env:DISCORD_ENABLED" -eq "true" $hours = [int]$env:TUNNEL_DURATION_HOURS $duration = $hours * 3600 @@ -236,8 +237,8 @@ jobs: $description = @" - tunnel opened for $hours hours, click [here](<$url>) to connect - requires authentication - - workflow [logs #$runId](<$workflowRunURL>) - - image downloaded from [run #$runId](<$composedURL>) + - workflow [logs #$thisWorkflowRunID](<$thisWorkflowRunURL>) + - image downloaded from [run #$artifactWorkflowRunID](<$artifactWorkflowRunURL>) - dispatched by $actor "@ From 
83991b9190173efcf2192e601da161a92058ab20 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 2 Jun 2025 10:28:26 +0700 Subject: [PATCH 261/346] added tuple.hlsl --- include/nbl/builtin/hlsl/tuple.hlsl | 61 +++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 include/nbl/builtin/hlsl/tuple.hlsl diff --git a/include/nbl/builtin/hlsl/tuple.hlsl b/include/nbl/builtin/hlsl/tuple.hlsl new file mode 100644 index 0000000000..a9c26090ea --- /dev/null +++ b/include/nbl/builtin/hlsl/tuple.hlsl @@ -0,0 +1,61 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_TUPLE_INCLUDED_ +#define _NBL_BUILTIN_HLSL_TUPLE_INCLUDED_ + +#include "nbl/builtin/hlsl/type_traits.hlsl" + +namespace nbl +{ +namespace hlsl +{ + +template // TODO: in the future use BOOST_PP to make this +struct tuple +{ + T0 t0; + T1 t1; + T2 t2; +}; + +template +struct tuple_element; + +template +struct tuple +{ + T0 t0; +}; + +template +struct tuple +{ + T0 t0; + T1 t1; +}; +// specializations for less and less void elements + +// base case +template +struct tuple_element<0,tuple > +{ + using type = Head; +}; + +template +struct tuple_element<1,tuple > +{ + using type = Head; +}; + +template +struct tuple_element<2,tuple > +{ + using type = Head; +}; + +} +} + +#endif From 209adb4f51d5646c7545a1615b4635b821921e13 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 2 Jun 2025 11:31:47 +0700 Subject: [PATCH 262/346] added some comments to config funcs for future debugging --- examples_tests | 2 +- .../builtin/hlsl/workgroup2/arithmetic_config.hlsl | 11 +++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/examples_tests b/examples_tests index 93b78108b4..3a3aaa9fce 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 93b78108b433cfb85407c5f6816adc4c58b0fb7b +Subproject commit 
3a3aaa9fce04cda7726170e2128124d466252a27 diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index c7832c360a..90b46b8c07 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -5,6 +5,7 @@ #define _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_CONFIG_INCLUDED_ #include "nbl/builtin/hlsl/cpp_compat.hlsl" +#include "nbl/builtin/hlsl/tuple.hlsl" namespace nbl { @@ -52,16 +53,16 @@ struct ArithmeticConfiguration static_assert(VirtualWorkgroupSize<=WorkgroupSize*SubgroupSize); using items_per_invoc_t = impl::items_per_invocation; - // NBL_CONSTEXPR_STATIC_INLINE uint32_t2 ItemsPerInvocation; TODO? doesn't allow inline definitions for uint32_t2 for some reason, uint32_t[2] as well ; declaring out of line results in not constant expression + using ItemsPerInvocation = tuple,integral_constant,integral_constant >; NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_0 = items_per_invoc_t::value0; NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_1 = items_per_invoc_t::value1; NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = items_per_invoc_t::value2; - static_assert(ItemsPerInvocation_1<=4, "3 level scan would have been needed with this config!"); NBL_CONSTEXPR_STATIC_INLINE uint16_t __ItemsPerVirtualWorkgroupLog2 = mpl::max_v; NBL_CONSTEXPR_STATIC_INLINE uint16_t __ItemsPerVirtualWorkgroup = uint16_t(0x1u) << __ItemsPerVirtualWorkgroupLog2; NBL_CONSTEXPR_STATIC_INLINE uint16_t __SubgroupsPerVirtualWorkgroup = __ItemsPerVirtualWorkgroup / ItemsPerInvocation_1; + // user specified the shared mem size of uint32_ts NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = conditional_value> SubgroupSizeLog2) + subgroupID; } + // get a coalesced index to store for the next level in shared mem, e.g. 
level 0 -> level 1 + // specify the next level to store values for in template param + // at level==LevelCount-1, it is guaranteed to have SubgroupSize elements template static uint32_t sharedStoreIndex(const uint32_t subgroupID) { @@ -102,6 +108,7 @@ struct ArithmeticConfiguration return sharedStoreIndex(virtualID); } + // get the coalesced index in shared mem at the current level template static uint32_t sharedLoadIndex(const uint32_t invocationIndex, const uint32_t component) { From 9cdaa9fd385ffc54c48d973ce11640f3a24b64f1 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 2 Jun 2025 13:37:44 +0200 Subject: [PATCH 263/346] change NSC package name, add badges creation & deploy --- .github/workflows/build-nabla.yml | 76 +++++++++++++++++++++++++++++-- .github/workflows/run-nsc.yml | 4 +- README.md | 12 +++-- compose.yml | 2 +- 4 files changed, 83 insertions(+), 11 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 67fc9c4401..cedecb3b92 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -5,6 +5,9 @@ on: pull_request: workflow_dispatch: +permissions: + contents: read + concurrency: group: push-lock-${{ github.ref }} cancel-in-progress: true @@ -54,10 +57,10 @@ jobs: shell: pwsh run: | $prefix = "run-windows-${{ matrix.tag }}-${{ matrix.vendor }}-${{ matrix.config }}" - $repo = $env:GITHUB_REPOSITORY - $tag = "nsc-godbolt-build-${{ matrix.vendor }}-${{ matrix.config }}-${{ matrix.tag }}" - $nscTargetTaggedImage = "ghcr.io/${repo}:${tag}".ToLower() - $nscTargetTaggedImageLatest = "ghcr.io/${repo}:nsc-godbolt-latest".ToLower() + $package = "nabla-shader-compiler-godbolt" + $tag = "build-${{ matrix.vendor }}-${{ matrix.config }}-${{ matrix.tag }}" + $nscTargetTaggedImage = "ghcr.io/${package}:${tag}".ToLower() + $nscTargetTaggedImageLatest = "ghcr.io/${package}:latest".ToLower() $shouldPushImage = ( "${{ github.ref }}" -eq "refs/heads/master" -and @@ -178,4 +181,67 @@ 
jobs: - name: Push images to GHCR if: steps.set-prefix.outputs.shouldPushImage == 'True' run: | - docker push ${{ steps.set-prefix.outputs.nscTargetTaggedImageLatest }} \ No newline at end of file + docker push ${{ steps.set-prefix.outputs.nscTargetTaggedImageLatest }} + + update-badges: + name: Update Build & Image Badges + if: ${{ always() && github.ref == 'refs/heads/master' }} + needs: build-windows + runs-on: windows-2022 + permissions: + contents: write + + steps: + - name: Create Build Badge + run: | + $jobStatus = "${{ needs.build-windows.result }}" + $buildMsg = if ($jobStatus -eq "success") { "passing" } else { "failing" } + $buildColor = if ($jobStatus -eq "success") { "brightgreen" } else { "red" } + + $buildBadge = @{ + schemaVersion = 1 + label = "build" + message = $buildMsg + color = $buildColor + } | ConvertTo-Json -Depth 2 + + $buildPath = ".badge-public/nabla" + New-Item -ItemType Directory -Path $buildPath -Force | Out-Null + $buildBadge | Set-Content -Path "$buildPath/build.json" -Encoding utf8 + + - name: Create Image Size Badge + run: | + $image = "ghcr.io/devsh-graphics-programming/nabla:nsc-godbolt-latest" + $manifest = docker manifest inspect $image | ConvertFrom-Json + + if ($manifest.manifests) { + $totalSize = ($manifest.manifests | Measure-Object -Property size -Sum).Sum + } elseif ($manifest.layers) { + $totalSize = ($manifest.layers | Measure-Object -Property size -Sum).Sum + } else { + Write-Error "No valid size information found in manifest." 
+ exit 1 + } + + $sizeMB = [Math]::Round($totalSize / 1MB, 2) + $size = "$sizeMB MB" + + $imageBadge = @{ + schemaVersion = 1 + label = $image + message = $size + color = "blue" + } | ConvertTo-Json -Depth 2 + + $imagePath = ".badge-public/packages/nabla-shader-compiler-nsc" + New-Item -ItemType Directory -Path $imagePath -Force | Out-Null + $imageBadge | Set-Content -Path "$imagePath/image-badge.json" -Encoding utf8 + + - name: Deploy Badges + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_branch: badges + publish_dir: .badge-public + keep_files: true + commit_message: "[CI] badges update" \ No newline at end of file diff --git a/.github/workflows/run-nsc.yml b/.github/workflows/run-nsc.yml index 456e0d3054..d5f9f74c2b 100644 --- a/.github/workflows/run-nsc.yml +++ b/.github/workflows/run-nsc.yml @@ -121,11 +121,11 @@ jobs: - name: Generate and run Docker Compose with matched image run: | $imageName = docker image ls --format "{{.Repository}}:{{.Tag}}" | - Where-Object { $_ -like "ghcr.io/devsh-graphics-programming/nabla:nsc-*" } | + Where-Object { $_ -like "ghcr.io/devsh-graphics-programming/nabla-shader-compiler-godbolt:build-*" } | Select-Object -First 1 if (-not $imageName) { - Write-Error "Could not find image with tag matching ghcr.io/devsh-graphics-programming/nabla:nsc-*" + Write-Error "Could not find image with tag matching ghcr.io/devsh-graphics-programming/nabla-shader-compiler-godbolt:build-*" exit 1 } diff --git a/README.md b/README.md index 2b85c9c460..f49fede7d7 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,15 @@
Click to see the source
-
- Click to see the source -
+ +

+ + Build Status + + License: Apache 2.0 + + Join our Discord +

# Table of Contents diff --git a/compose.yml b/compose.yml index 8d6f1bc64a..3f32e8d1b5 100644 --- a/compose.yml +++ b/compose.yml @@ -1,7 +1,7 @@ services: nsc: container_name: nsc-godbolt - image: ghcr.io/devsh-graphics-programming/nabla:nsc-godbolt-latest + image: ghcr.io/devsh-graphics-programming/nabla-shader-compiler-godbolt:latest isolation: process ports: - "80:10240" From 67a9a07d43c41d4f7746342a53c8e5731c6d5dd3 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Mon, 2 Jun 2025 14:35:44 +0200 Subject: [PATCH 264/346] Update build-nabla.yml, typo --- .github/workflows/build-nabla.yml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index cedecb3b92..e116e3f5ec 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -57,10 +57,11 @@ jobs: shell: pwsh run: | $prefix = "run-windows-${{ matrix.tag }}-${{ matrix.vendor }}-${{ matrix.config }}" + $owner = "${{ github.repository_owner }}" $package = "nabla-shader-compiler-godbolt" $tag = "build-${{ matrix.vendor }}-${{ matrix.config }}-${{ matrix.tag }}" - $nscTargetTaggedImage = "ghcr.io/${package}:${tag}".ToLower() - $nscTargetTaggedImageLatest = "ghcr.io/${package}:latest".ToLower() + $nscTargetTaggedImage = "ghcr.io/${owner}/${package}:${tag}".ToLower() + $nscTargetTaggedImageLatest = "ghcr.io/${owner}/${package}:latest".ToLower() $shouldPushImage = ( "${{ github.ref }}" -eq "refs/heads/master" -and @@ -211,6 +212,10 @@ jobs: - name: Create Image Size Badge run: | + $owner = "${{ github.repository_owner }}" + $package = "nabla-shader-compiler-godbolt" + $nscTargetTaggedImageLatest = "ghcr.io/${owner}/${package}:latest".ToLower() + $image = "ghcr.io/devsh-graphics-programming/nabla:nsc-godbolt-latest" $manifest = docker manifest inspect $image | ConvertFrom-Json @@ -244,4 +249,4 @@ jobs: publish_branch: badges publish_dir: 
.badge-public keep_files: true - commit_message: "[CI] badges update" \ No newline at end of file + commit_message: "[CI] badges update" From 9178ec3ae3edc9c9e8a1f37817b5eb568f16d408 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 2 Jun 2025 15:57:08 +0200 Subject: [PATCH 265/346] update badge labels, discord links and tools/nsc/docker/README.md --- .github/workflows/build-nabla.yml | 7 ++++--- README.md | 2 +- tools/nsc/docker/README.md | 14 ++++++++++++++ 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index e116e3f5ec..8988fe6df6 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -193,6 +193,9 @@ jobs: contents: write steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Create Build Badge run: | $jobStatus = "${{ needs.build-windows.result }}" @@ -214,9 +217,7 @@ jobs: run: | $owner = "${{ github.repository_owner }}" $package = "nabla-shader-compiler-godbolt" - $nscTargetTaggedImageLatest = "ghcr.io/${owner}/${package}:latest".ToLower() - - $image = "ghcr.io/devsh-graphics-programming/nabla:nsc-godbolt-latest" + $image = "ghcr.io/${owner}/${package}:latest".ToLower() $manifest = docker manifest inspect $image | ConvertFrom-Json if ($manifest.manifests) { diff --git a/README.md b/README.md index f49fede7d7..a696846b30 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Build Status License: Apache 2.0 - + Join our Discord

diff --git a/tools/nsc/docker/README.md b/tools/nsc/docker/README.md index afd8b0f8b7..d44eea9f81 100644 --- a/tools/nsc/docker/README.md +++ b/tools/nsc/docker/README.md @@ -4,6 +4,17 @@ https://github.com/user-attachments/assets/8d409477-92e4-4238-b5e5-637cfbdf7263 +

+ + Image Status + + Build Status + + License: Apache 2.0 + + Join our Discord +

+ ## Requirements - Configured [***Docker***](https://docs.docker.com/desktop/setup/install/windows-install/) for Windows Containers @@ -12,6 +23,9 @@ https://github.com/user-attachments/assets/8d409477-92e4-4238-b5e5-637cfbdf7263 > [!TIP] > type `cmd /ver` to see your build version +> [!WARNING] +> You cannot run it on Windows Home Edition as it doesn't have `Containers` feature, visit Microsoft [docs]() for more details + > [!CAUTION] > Hyper-V is **NOT** supported, you must run NSC Godbolt container as process From 7d77d30baacb673d7f1ca63e0e015ea984d8455d Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 3 Jun 2025 12:10:18 +0700 Subject: [PATCH 266/346] change indexing to uint16_t --- .../hlsl/workgroup2/arithmetic_config.hlsl | 28 ++-- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 137 +++++++++--------- 2 files changed, 84 insertions(+), 81 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 90b46b8c07..a9fdcfe0a4 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -58,15 +58,19 @@ struct ArithmeticConfiguration NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_1 = items_per_invoc_t::value1; NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = items_per_invoc_t::value2; - NBL_CONSTEXPR_STATIC_INLINE uint16_t __ItemsPerVirtualWorkgroupLog2 = mpl::max_v; - NBL_CONSTEXPR_STATIC_INLINE uint16_t __ItemsPerVirtualWorkgroup = uint16_t(0x1u) << __ItemsPerVirtualWorkgroupLog2; - NBL_CONSTEXPR_STATIC_INLINE uint16_t __SubgroupsPerVirtualWorkgroup = __ItemsPerVirtualWorkgroup / ItemsPerInvocation_1; + // NBL_CONSTEXPR_STATIC_INLINE uint16_t __ItemsPerVirtualWorkgroupLog2 = mpl::max_v; + // NBL_CONSTEXPR_STATIC_INLINE uint16_t __ItemsPerVirtualWorkgroup = uint16_t(0x1u) << __ItemsPerVirtualWorkgroupLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelInputCount_1 = 
conditional_value>SubgroupSizeLog2), SubgroupSize>, + SubgroupSize*ItemsPerInvocation_1>::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelInputCount_2 = conditional_value::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t __SubgroupsPerVirtualWorkgroup = LevelInputCount_1 / ItemsPerInvocation_1; // user specified the shared mem size of uint32_ts NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = conditional_value::value >::value; @@ -78,7 +82,7 @@ struct ArithmeticConfiguration // gets a subgroupID as if each workgroup has (VirtualWorkgroupSize/SubgroupSize) subgroups // each subgroup does work (VirtualWorkgroupSize/WorkgroupSize) times, the index denoted by workgroupInVirtualIndex - static uint32_t virtualSubgroupID(const uint32_t subgroupID, const uint32_t workgroupInVirtualIndex) + static uint16_t virtualSubgroupID(const uint16_t subgroupID, const uint16_t workgroupInVirtualIndex) { return workgroupInVirtualIndex * (WorkgroupSize >> SubgroupSizeLog2) + subgroupID; } @@ -87,30 +91,30 @@ struct ArithmeticConfiguration // specify the next level to store values for in template param // at level==LevelCount-1, it is guaranteed to have SubgroupSize elements template - static uint32_t sharedStoreIndex(const uint32_t subgroupID) + static uint16_t sharedStoreIndex(const uint16_t subgroupID) { - uint32_t offsetBySubgroup; + uint16_t offsetBySubgroup; if (level == LevelCount-1) offsetBySubgroup = SubgroupSize; else offsetBySubgroup = __SubgroupsPerVirtualWorkgroup; if (level<2) - return (subgroupID & (ItemsPerInvocation_1-1)) * offsetBySubgroup + (subgroupID/ItemsPerInvocation_1); + return (subgroupID & (ItemsPerInvocation_1-uint16_t(1u))) * offsetBySubgroup + (subgroupID/ItemsPerInvocation_1); else - return (subgroupID & (ItemsPerInvocation_2-1)) * offsetBySubgroup + (subgroupID/ItemsPerInvocation_2); + return (subgroupID & (ItemsPerInvocation_2-uint16_t(1u))) * offsetBySubgroup + (subgroupID/ItemsPerInvocation_2); } template - static uint32_t 
sharedStoreIndexFromVirtualIndex(const uint32_t subgroupID, const uint32_t workgroupInVirtualIndex) + static uint16_t sharedStoreIndexFromVirtualIndex(const uint16_t subgroupID, const uint16_t workgroupInVirtualIndex) { - const uint32_t virtualID = virtualSubgroupID(subgroupID, workgroupInVirtualIndex); + const uint16_t virtualID = virtualSubgroupID(subgroupID, workgroupInVirtualIndex); return sharedStoreIndex(virtualID); } // get the coalesced index in shared mem at the current level template - static uint32_t sharedLoadIndex(const uint32_t invocationIndex, const uint32_t component) + static uint16_t sharedLoadIndex(const uint16_t invocationIndex, const uint16_t component) { if (level == LevelCount-1) return component * SubgroupSize + invocationIndex; diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index de55a131b8..78ed124baf 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -43,7 +43,7 @@ struct reduce subgroup2::reduction reduction; vector_t value; - dataAccessor.template get(glsl::gl_SubgroupInvocationID(), value); + dataAccessor.template get(uint16_t(glsl::gl_SubgroupInvocationID()), value); return reduction(value); } }; @@ -62,7 +62,7 @@ struct scan using params_t = subgroup2::ArithmeticParams; vector_t value; - dataAccessor.template get(glsl::gl_SubgroupInvocationID(), value); + dataAccessor.template get(uint16_t(glsl::gl_SubgroupInvocationID()), value); if (Exclusive) { subgroup2::exclusive_scan excl_scan; @@ -73,7 +73,7 @@ struct scan subgroup2::inclusive_scan incl_scan; value = incl_scan(value); } - dataAccessor.template set(glsl::gl_SubgroupInvocationID(), value); + dataAccessor.template set(uint16_t(glsl::gl_SubgroupInvocationID()), value); } }; @@ -88,19 +88,19 @@ struct reduce template static void __doLevel0(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { - const 
uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 0 scan subgroup2::reduction reduction0; [unroll] - for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + for (uint16_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { vector_t scan_local; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); scan_local = reduction0(scan_local); if (Config::electLast()) { - const uint32_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); - scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(uint16_t(glsl::gl_SubgroupID()), idx); + scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -116,19 +116,19 @@ struct reduce __doLevel0(dataAccessor, scratchAccessor); - const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan subgroup2::reduction reduction1; if (glsl::gl_SubgroupID() == 0) { vector_lv1_t lv1_val; [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, 
i),lv1_val[i]); lv1_val = reduction1(lv1_val); if (Config::electLast()) - scratchAccessor.template set(0, lv1_val[Config::ItemsPerInvocation_1-1]); + scratchAccessor.template set(0, lv1_val[Config::ItemsPerInvocation_1-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -148,20 +148,20 @@ struct scan template static void __doLevel0(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { - const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); subgroup2::inclusive_scan inclusiveScan0; // level 0 scan [unroll] - for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + for (uint16_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { vector_t value; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); value = inclusiveScan0(value); - dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); if (Config::electLast()) { - const uint32_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); - scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(uint16_t(glsl::gl_SubgroupID()), idx); + scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -177,48 +177,48 @@ struct scan 
__doLevel0(dataAccessor, scratchAccessor); - const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan subgroup2::exclusive_scan exclusiveScan1; if (glsl::gl_SubgroupID() == 0) { vector_lv1_t lv1_val; [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); // lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); lv1_val = exclusiveScan1(lv1_val); [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); // combine with level 0 [unroll] - for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + for (uint16_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { vector_lv0_t value; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - const uint32_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); + const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(uint16_t(glsl::gl_SubgroupID()), idx); scalar_t left; - scratchAccessor.template get(bankedIndex,left); + scratchAccessor.template get(bankedIndex,left); if 
(Exclusive) { scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(value[Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); [unroll] - for (uint32_t i = Config::ItemsPerInvocation_0-1; i > 0; i--) + for (uint16_t i = Config::ItemsPerInvocation_0-1; i > 0; i--) value[i] = binop(left, value[i-1]); value[0] = binop(left, left_last_elem); } else { [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) + for (uint16_t i = 0; i < Config::ItemsPerInvocation_0; i++) value[i] = binop(left, value[i]); } - dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); } } }; @@ -243,21 +243,21 @@ struct reduce reduce::template __doLevel0(dataAccessor, scratchAccessor); - const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan const uint32_t lv1_smem_size = Config::__ItemsPerVirtualWorkgroup; subgroup2::reduction reduction1; - if (glsl::gl_SubgroupID() < Config::SubgroupSize*Config::ItemsPerInvocation_2) + if (glsl::gl_SubgroupID() < Config::LevelInputCount_2) { vector_lv1_t lv1_val; [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); lv1_val = reduction1(lv1_val); if (Config::electLast()) { - const uint32_t bankedIndex = Config::template sharedStoreIndex<2>(glsl::gl_SubgroupID()); - scratchAccessor.template set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + const uint16_t bankedIndex = Config::template sharedStoreIndex<2>(uint16_t(glsl::gl_SubgroupID())); + scratchAccessor.template 
set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -268,16 +268,16 @@ struct reduce { vector_lv2_t lv2_val; [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); + for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) + scratchAccessor.template get(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); lv2_val = reduction2(lv2_val); if (Config::electLast()) - scratchAccessor.template set(0, lv2_val[Config::ItemsPerInvocation_2-1]); + scratchAccessor.template set(0, lv2_val[Config::ItemsPerInvocation_2-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); scalar_t reduce_val; - scratchAccessor.template get(0,reduce_val); + scratchAccessor.template get(0,reduce_val); return reduce_val; } }; @@ -301,26 +301,25 @@ struct scan scan::template __doLevel0(dataAccessor, scratchAccessor); - const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan - const uint32_t lv1_smem_size = Config::__ItemsPerVirtualWorkgroup; - const uint32_t lv1_num_invoc = Config::SubgroupSize*Config::ItemsPerInvocation_2; + const uint32_t lv1_smem_size = Config::LevelInputCount_1; subgroup2::inclusive_scan inclusiveScan1; - if (glsl::gl_SubgroupID() < lv1_num_invoc) + if (glsl::gl_SubgroupID() < Config::LevelInputCount_2) { vector_lv1_t lv1_val; [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); // lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); 
lv1_val = inclusiveScan1(lv1_val); [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); if (Config::electLast()) { - const uint32_t bankedIndex = Config::template sharedStoreIndex<2>(glsl::gl_SubgroupID()); - scratchAccessor.template set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + const uint16_t bankedIndex = Config::template sharedStoreIndex<2>(uint16_t(glsl::gl_SubgroupID())); + scratchAccessor.template set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -331,62 +330,62 @@ struct scan { vector_lv2_t lv2_val; [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); + for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) + scratchAccessor.template get(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); // lv2_val[0] = hlsl::mix(BinOp::identity, lv2_val[0], bool(invocationIndex)); lv2_val = exclusiveScan2(lv2_val); [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template set(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); + for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) + scratchAccessor.template set(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); // combine with level 1 - if (glsl::gl_SubgroupID() < lv1_num_invoc) + if (glsl::gl_SubgroupID() < Config::LevelInputCount_2) { vector_lv1_t lv1_val; [unroll] - for (uint32_t i = 0; i < 
Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i), lv1_val[i]); + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i), lv1_val[i]); const scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(lv1_val[Config::ItemsPerInvocation_1-1],1), bool(glsl::gl_SubgroupInvocationID())); scalar_t lv2_scan; - const uint32_t bankedIndex = Config::template sharedStoreIndex<2>(glsl::gl_SubgroupID()); - scratchAccessor.template get(lv1_smem_size+bankedIndex, lv2_scan); + const uint16_t bankedIndex = Config::template sharedStoreIndex<2>(uint16_t(glsl::gl_SubgroupID())); + scratchAccessor.template get(lv1_smem_size+bankedIndex, lv2_scan); [unroll] - for (uint32_t i = Config::ItemsPerInvocation_1-1; i > 0; i--) - scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i), binop(lv1_val[i-1],lv2_scan)); - scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, 0), binop(left_last_elem,lv2_scan)); + for (uint16_t i = Config::ItemsPerInvocation_1-1; i > 0; i--) + scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i), binop(lv1_val[i-1],lv2_scan)); + scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, 0), binop(left_last_elem,lv2_scan)); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); // combine with level 0 [unroll] - for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + for (uint16_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { vector_lv0_t value; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - 
const uint32_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); + const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); scalar_t left; - scratchAccessor.template get(bankedIndex,left); + scratchAccessor.template get(bankedIndex,left); if (Exclusive) { scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(value[Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); [unroll] - for (uint32_t i = Config::ItemsPerInvocation_0-1; i > 0; i--) + for (uint16_t i = Config::ItemsPerInvocation_0-1; i > 0; i--) value[i] = binop(left, value[i-1]); value[0] = binop(left, left_last_elem); } else { [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) + for (uint16_t i = 0; i < Config::ItemsPerInvocation_0; i++) value[i] = binop(left, value[i]); } - dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); } } }; From 7b15a544161cd8a6fb2011dac615928922d42c92 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 3 Jun 2025 15:49:02 +0700 Subject: [PATCH 267/346] do inclusive scan on upsweep and shift left on downsweep --- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 41 ++++++++++--------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 78ed124baf..d473e466b9 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -179,15 +179,14 @@ struct scan const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan - subgroup2::exclusive_scan exclusiveScan1; + subgroup2::inclusive_scan inclusiveScan1; if (glsl::gl_SubgroupID() == 0) { vector_lv1_t lv1_val; [unroll] for (uint16_t i = 0; i < 
Config::ItemsPerInvocation_1; i++) scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); - // lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); - lv1_val = exclusiveScan1(lv1_val); + lv1_val = inclusiveScan1(lv1_val); [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); @@ -201,9 +200,12 @@ struct scan vector_lv0_t value; dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(uint16_t(glsl::gl_SubgroupID()), idx); + const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(uint16_t(glsl::gl_SubgroupID()-1u), idx); scalar_t left; - scratchAccessor.template get(bankedIndex,left); + if (idx != 0 || glsl::gl_SubgroupID() != 0) + scratchAccessor.template get(bankedIndex,left); + else + left = BinOp::identity; if (Exclusive) { scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(value[Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); @@ -245,7 +247,7 @@ struct reduce const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan - const uint32_t lv1_smem_size = Config::__ItemsPerVirtualWorkgroup; + const uint32_t lv1_smem_size = Config::LevelInputCount_1; subgroup2::reduction reduction1; if (glsl::gl_SubgroupID() < Config::LevelInputCount_2) { @@ -311,7 +313,6 @@ struct scan [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); - // lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); lv1_val = inclusiveScan1(lv1_val); [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) @@ -325,15 +326,14 @@ struct scan 
scratchAccessor.workgroupExecutionAndMemoryBarrier(); // level 2 scan - subgroup2::exclusive_scan exclusiveScan2; + subgroup2::inclusive_scan inclusiveScan2; if (glsl::gl_SubgroupID() == 0) { vector_lv2_t lv2_val; [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) scratchAccessor.template get(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); - // lv2_val[0] = hlsl::mix(BinOp::identity, lv2_val[0], bool(invocationIndex)); - lv2_val = exclusiveScan2(lv2_val); + lv2_val = inclusiveScan2(lv2_val); [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) scratchAccessor.template set(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); @@ -344,20 +344,18 @@ struct scan if (glsl::gl_SubgroupID() < Config::LevelInputCount_2) { vector_lv1_t lv1_val; + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex-uint16_t(1u), Config::ItemsPerInvocation_1-uint16_t(1u)), lv1_val[0]); [unroll] - for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i), lv1_val[i]); - - const scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(lv1_val[Config::ItemsPerInvocation_1-1],1), bool(glsl::gl_SubgroupInvocationID())); + for (uint16_t i = 1; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i-uint16_t(1u)), lv1_val[i]); scalar_t lv2_scan; - const uint16_t bankedIndex = Config::template sharedStoreIndex<2>(uint16_t(glsl::gl_SubgroupID())); + const uint16_t bankedIndex = Config::template sharedStoreIndex<2>(uint16_t(glsl::gl_SubgroupID()-1u)); scratchAccessor.template get(lv1_smem_size+bankedIndex, lv2_scan); [unroll] - for (uint16_t i = Config::ItemsPerInvocation_1-1; i > 0; i--) - scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i), 
binop(lv1_val[i-1],lv2_scan)); - scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, 0), binop(left_last_elem,lv2_scan)); + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i--) + scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i), binop(lv1_val[i],lv2_scan)); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -368,9 +366,12 @@ struct scan vector_lv0_t value; dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); + const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(uint16_t(glsl::gl_SubgroupID()-1u), idx); scalar_t left; - scratchAccessor.template get(bankedIndex,left); + if (idx != 0 || glsl::gl_SubgroupID() != 0) + scratchAccessor.template get(bankedIndex,left); + else + left = BinOp::identity; if (Exclusive) { scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(value[Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); From 37aa99baee12a87bcb351d74988e7a6349317e6e Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 3 Jun 2025 16:46:32 +0700 Subject: [PATCH 268/346] some adjustments to config and func usages --- .../builtin/hlsl/workgroup2/arithmetic.hlsl | 6 ++-- .../hlsl/workgroup2/arithmetic_config.hlsl | 28 +++++++++---------- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 14 ++++------ 3 files changed, 23 insertions(+), 25 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl index 643f8d123e..62a9fb7bef 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl @@ -17,7 +17,7 @@ namespace hlsl namespace workgroup2 { -template +template) struct reduction { using scalar_t = typename BinOp::type_t; @@ -30,7 +30,7 @@ struct 
reduction } }; -template +template) struct inclusive_scan { using scalar_t = typename BinOp::type_t; @@ -43,7 +43,7 @@ struct inclusive_scan } }; -template +template) struct exclusive_scan { using scalar_t = typename BinOp::type_t; diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index a9fdcfe0a4..e2cf846d6c 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -36,6 +36,8 @@ struct items_per_invocation NBL_CONSTEXPR_STATIC_INLINE uint16_t value0 = BaseItemsPerInvocation; NBL_CONSTEXPR_STATIC_INLINE uint16_t value1 = uint16_t(0x1u) << conditional_value, ItemsPerInvocationProductLog2>::value; NBL_CONSTEXPR_STATIC_INLINE uint16_t value2 = uint16_t(0x1u) << mpl::max_v; + + using ItemsPerInvocation = tuple,integral_constant,integral_constant >; }; } @@ -53,26 +55,24 @@ struct ArithmeticConfiguration static_assert(VirtualWorkgroupSize<=WorkgroupSize*SubgroupSize); using items_per_invoc_t = impl::items_per_invocation; - using ItemsPerInvocation = tuple,integral_constant,integral_constant >; NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_0 = items_per_invoc_t::value0; NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_1 = items_per_invoc_t::value1; NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = items_per_invoc_t::value2; + static_assert(ItemsPerInvocation_2<=4, "4 level scan would have been needed with this config!"); - // NBL_CONSTEXPR_STATIC_INLINE uint16_t __ItemsPerVirtualWorkgroupLog2 = mpl::max_v; - // NBL_CONSTEXPR_STATIC_INLINE uint16_t __ItemsPerVirtualWorkgroup = uint16_t(0x1u) << __ItemsPerVirtualWorkgroupLog2; NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelInputCount_1 = conditional_value>SubgroupSizeLog2), SubgroupSize>, SubgroupSize*ItemsPerInvocation_1>::value; NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelInputCount_2 = conditional_value::value; 
NBL_CONSTEXPR_STATIC_INLINE uint16_t __SubgroupsPerVirtualWorkgroup = LevelInputCount_1 / ItemsPerInvocation_1; - // user specified the shared mem size of uint32_ts + // user specified the shared mem size of Scalars NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = conditional_value::value + LevelInputCount_2, + 0 + >::value + LevelInputCount_1 >::value; static bool electLast() @@ -90,8 +90,8 @@ struct ArithmeticConfiguration // get a coalesced index to store for the next level in shared mem, e.g. level 0 -> level 1 // specify the next level to store values for in template param // at level==LevelCount-1, it is guaranteed to have SubgroupSize elements - template - static uint16_t sharedStoreIndex(const uint16_t subgroupID) + template0 && level + template0 && level + template0 && level const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan - const uint32_t lv1_smem_size = Config::LevelInputCount_1; subgroup2::reduction reduction1; if (glsl::gl_SubgroupID() < Config::LevelInputCount_2) { @@ -259,7 +258,7 @@ struct reduce if (Config::electLast()) { const uint16_t bankedIndex = Config::template sharedStoreIndex<2>(uint16_t(glsl::gl_SubgroupID())); - scratchAccessor.template set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + scratchAccessor.template set(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -271,7 +270,7 @@ struct reduce vector_lv2_t lv2_val; [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); + scratchAccessor.template get(Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); lv2_val = reduction2(lv2_val); if (Config::electLast()) scratchAccessor.template set(0, lv2_val[Config::ItemsPerInvocation_2-1]); @@ -305,7 +304,6 @@ struct scan const uint16_t invocationIndex = 
workgroup::SubgroupContiguousIndex(); // level 1 scan - const uint32_t lv1_smem_size = Config::LevelInputCount_1; subgroup2::inclusive_scan inclusiveScan1; if (glsl::gl_SubgroupID() < Config::LevelInputCount_2) { @@ -320,7 +318,7 @@ struct scan if (Config::electLast()) { const uint16_t bankedIndex = Config::template sharedStoreIndex<2>(uint16_t(glsl::gl_SubgroupID())); - scratchAccessor.template set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + scratchAccessor.template set(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -332,11 +330,11 @@ struct scan vector_lv2_t lv2_val; [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); + scratchAccessor.template get(Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); lv2_val = inclusiveScan2(lv2_val); [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template set(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); + scratchAccessor.template set(Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -351,7 +349,7 @@ struct scan scalar_t lv2_scan; const uint16_t bankedIndex = Config::template sharedStoreIndex<2>(uint16_t(glsl::gl_SubgroupID()-1u)); - scratchAccessor.template get(lv1_smem_size+bankedIndex, lv2_scan); + scratchAccessor.template get(bankedIndex, lv2_scan); [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i--) From eaffe98a29f5a0968bcecb2add6cb27db91d6602 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Tue, 3 Jun 2025 14:43:02 +0200 Subject: [PATCH 269/346] Update compose.yml --- compose.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git 
a/compose.yml b/compose.yml index 3f32e8d1b5..f9444275f4 100644 --- a/compose.yml +++ b/compose.yml @@ -13,4 +13,9 @@ services: - type: bind source: C:\Windows\System32 target: C:\mount\Windows\System32 - read_only: true \ No newline at end of file + read_only: true + +networks: + default: + external: true + name: docker_default From d514007886f35ec82d26d40ab15debcb36548324 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Tue, 3 Jun 2025 15:28:03 +0200 Subject: [PATCH 270/346] Update compose.yml, restart: always to boot after host wakes up --- compose.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/compose.yml b/compose.yml index f9444275f4..c80bdb4319 100644 --- a/compose.yml +++ b/compose.yml @@ -14,6 +14,7 @@ services: source: C:\Windows\System32 target: C:\mount\Windows\System32 read_only: true + restart: always networks: default: From f09ca19f5fff3a089f3ff91be780ab7dbcfd8f98 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Tue, 3 Jun 2025 16:15:38 +0200 Subject: [PATCH 271/346] Update build-nabla.yml, add deploy-production job --- .github/workflows/build-nabla.yml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 8988fe6df6..3e8e0b4dd0 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -251,3 +251,19 @@ jobs: publish_dir: .badge-public keep_files: true commit_message: "[CI] badges update" + + deploy-production: + name: Deploy to production host + if: ${{ always() && github.ref == 'refs/heads/master' }} + needs: build-windows + runs-on: ubuntu-latest + + steps: + - name: Pull latest images, re-run containers + uses: appleboy/ssh-action@v1 + with: + host: ${{ secrets.CE_HOST }} + username: ${{ secrets.CE_USER }} + key: ${{ secrets.CE_KEY }} + script: | + powershell -NoLogo -NoProfile -ExecutionPolicy Bypass -NoExit 
-File C:\Scripts\startup-docker.ps1 From 3b3d45c83c7ae6f1a4ae05a3fdd69844a4b94bf1 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 4 Jun 2025 01:05:20 +0200 Subject: [PATCH 272/346] always set the callback back, because even if it were empty it needs to be empty again --- src/nbl/video/utilities/CAssetConverter.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 4aa631c746..0ef13633da 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -5344,8 +5344,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul retval.set({params.transfer->scratchSemaphore.semaphore,params.transfer->scratchSemaphore.value}); } // reset original callback - if (bool(origXferStallCallback)) - params.transfer->overflowCallback = std::move(origXferStallCallback); + params.transfer->overflowCallback = std::move(origXferStallCallback); // Its too dangerous to leave an Intended Transfer Submit hanging around that needs to be submitted for Compute to make forward progress outside of this utility, // and doing transfer-signals-after-compute-wait timeline sema tricks are not and option because: From da6c3134e342eb37517f78974c8febe5e26ec2ca Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 4 Jun 2025 11:14:06 +0700 Subject: [PATCH 273/346] split out level 0 scans into its own struct --- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 79 +++++++++++-------- 1 file changed, 47 insertions(+), 32 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 4edb5ae9ff..329542fa18 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -77,16 +77,15 @@ struct scan } }; -// 2-level scans +// do level 0 scans for 2- and 3-level scans (same code) template -struct reduce 
+struct reduce_level0 { using scalar_t = typename BinOp::type_t; - using vector_lv0_t = vector; // data accessor needs to be this type - using vector_lv1_t = vector; + using vector_t = vector; // data accessor needs to be this type - template - static void __doLevel0(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + template + static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 0 scan @@ -104,7 +103,45 @@ struct reduce } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); + }; +}; + +template +struct scan_level0 +{ + using scalar_t = typename BinOp::type_t; + using vector_t = vector; // data accessor needs to be this type + + template + static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); + subgroup2::inclusive_scan inclusiveScan0; + // level 0 scan + [unroll] + for (uint16_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + vector_t value; + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + value = inclusiveScan0(value); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + if (Config::electLast()) + { + const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(uint16_t(glsl::gl_SubgroupID()), idx); + scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + } + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); } +}; + +// 2-level scans +template +struct reduce +{ + using scalar_t = typename BinOp::type_t; + using vector_lv0_t = vector; // data accessor needs to be this type + using 
vector_lv1_t = vector; template scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) @@ -114,7 +151,7 @@ struct reduce using params_lv1_t = subgroup2::ArithmeticParams; BinOp binop; - __doLevel0(dataAccessor, scratchAccessor); + reduce_level0::template __call(dataAccessor, scratchAccessor); const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan @@ -145,28 +182,6 @@ struct scan using vector_lv0_t = vector; // data accessor needs to be this type using vector_lv1_t = vector; - template - static void __doLevel0(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) - { - const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); - subgroup2::inclusive_scan inclusiveScan0; - // level 0 scan - [unroll] - for (uint16_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) - { - vector_t value; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - value = inclusiveScan0(value); - dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - if (Config::electLast()) - { - const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(uint16_t(glsl::gl_SubgroupID()), idx); - scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan - } - } - scratchAccessor.workgroupExecutionAndMemoryBarrier(); - } - template void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { @@ -175,7 +190,7 @@ struct scan using params_lv1_t = subgroup2::ArithmeticParams; BinOp binop; - __doLevel0(dataAccessor, scratchAccessor); + scan_level0::template __call(dataAccessor, scratchAccessor); const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan @@ -243,7 +258,7 @@ struct 
reduce using params_lv2_t = subgroup2::ArithmeticParams; BinOp binop; - reduce::template __doLevel0(dataAccessor, scratchAccessor); + reduce_level0::template __call(dataAccessor, scratchAccessor); const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan @@ -300,7 +315,7 @@ struct scan using params_lv2_t = subgroup2::ArithmeticParams; BinOp binop; - scan::template __doLevel0(dataAccessor, scratchAccessor); + scan_level0::template __call(dataAccessor, scratchAccessor); const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan From e230d06aaea58f47d7ec5059990f862c4230c246 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 4 Jun 2025 15:34:40 +0700 Subject: [PATCH 274/346] fixes to 3 level scan --- .../builtin/hlsl/workgroup2/arithmetic_config.hlsl | 6 +++++- include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl | 12 +++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index e2cf846d6c..aecd489beb 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -116,8 +116,12 @@ struct ArithmeticConfiguration template0 && level if (glsl::gl_SubgroupID() < Config::LevelInputCount_2) { vector_lv1_t lv1_val; - scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex-uint16_t(1u), Config::ItemsPerInvocation_1-uint16_t(1u)), lv1_val[0]); [unroll] - for (uint16_t i = 1; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i-uint16_t(1u)), lv1_val[i]); + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i), lv1_val[i]); scalar_t lv2_scan; const uint16_t bankedIndex = Config::template 
sharedStoreIndex<2>(uint16_t(glsl::gl_SubgroupID()-1u)); - scratchAccessor.template get(bankedIndex, lv2_scan); + if (glsl::gl_SubgroupID() != 0) + scratchAccessor.template get(bankedIndex, lv2_scan); + else + lv2_scan = BinOp::identity; [unroll] - for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i--) + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i), binop(lv1_val[i],lv2_scan)); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); From 3da175daca07a49ffad2672b1d3e74b46221e13b Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 5 Jun 2025 10:53:40 +0700 Subject: [PATCH 275/346] padding to shared mem indexing to avoid bank conflict --- .../hlsl/workgroup2/arithmetic_config.hlsl | 33 ++++++++++--------- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 7 +--- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index aecd489beb..0177863b11 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -70,10 +70,11 @@ struct ArithmeticConfiguration NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = conditional_value::value + LevelInputCount_1 >::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t __padding = conditional_value::value; static bool electLast() { @@ -90,22 +91,22 @@ struct ArithmeticConfiguration // get a coalesced index to store for the next level in shared mem, e.g. 
level 0 -> level 1 // specify the next level to store values for in template param // at level==LevelCount-1, it is guaranteed to have SubgroupSize elements - template0 && level// NBL_FUNC_REQUIRES(level>0 && level0 && level// NBL_FUNC_REQUIRES(level>0 && level0 && level// NBL_FUNC_REQUIRES(level>0 && level [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); - if (Config::electLast()) - { - const uint16_t bankedIndex = Config::template sharedStoreIndex<2>(uint16_t(glsl::gl_SubgroupID())); - scratchAccessor.template set(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); - } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -345,7 +340,7 @@ struct scan vector_lv2_t lv2_val; [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); + scratchAccessor.template get(Config::template sharedLoadIndex<1>(((invocationIndex*Config::ItemsPerInvocation_1)+i+1)*Config::SubgroupSize-1, Config::ItemsPerInvocation_1-1),lv2_val[i]); lv2_val = inclusiveScan2(lv2_val); [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) From 32732e784f835787f724593675c9445bd0742ed7 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 5 Jun 2025 12:16:23 +0700 Subject: [PATCH 276/346] fix padding bugs --- include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 79c62399d2..80dec1b85c 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -337,10 +337,11 @@ struct scan subgroup2::inclusive_scan inclusiveScan2; if (glsl::gl_SubgroupID() == 0) { + const uint16_t one = uint16_t(1u); vector_lv2_t lv2_val; 
[unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(Config::template sharedLoadIndex<1>(((invocationIndex*Config::ItemsPerInvocation_1)+i+1)*Config::SubgroupSize-1, Config::ItemsPerInvocation_1-1),lv2_val[i]); + scratchAccessor.template get(Config::template sharedLoadIndex<1>((invocationIndex*Config::ItemsPerInvocation_2+i+one)*Config::SubgroupSize-one, Config::ItemsPerInvocation_1-one),lv2_val[i]); lv2_val = inclusiveScan2(lv2_val); [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) From 1fc684d74ce6463944bb7817959992f183d23dc2 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 6 Jun 2025 17:36:04 +0700 Subject: [PATCH 277/346] Fix AssetConverter after merge --- src/nbl/video/utilities/CAssetConverter.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index d07e305777..8204a61e27 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2415,8 +2415,12 @@ struct conversions_t if (!deferredAllocator->request(output,constrainMask)) return; } - // set debug names on everything! - setDebugName(conv,output->get(),contentHash,uniqueCopyGroupID); + + if constexpr (!std::is_same_v) + { + // set debug names on everything + setDebugName(conv,output->get(),contentHash,uniqueCopyGroupID); + } } // Since the dfsCache has the original asset pointers as keys, we map in reverse (multiple `instance_t` can map to the same unique content hash and GPU object) @@ -3042,10 +3046,6 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult .writeCache = inputs.writeShaderCache }; - // no one depend on the converted IShaders so we need to hold a smart ptr into them somewhere. 
- // This is to prevent m_stagingCache to hold a dangling pointer into IShader - retval.m_shaders.reserve(gpuObjUniqueCopyGroupIDs.size()); - for (auto& entry : conversionRequests.contentHashToCanonical) for (auto i=0ull; i SReserveResult pruneStaging.template operator()(); pruneStaging.template operator()(); pruneStaging.template operator()(); - pruneStaging.template operator()(); + pruneStaging.template operator()(); pruneStaging.template operator()(); pruneStaging.template operator()(); pruneStaging.template operator()(); @@ -3667,7 +3667,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul }; // wipe gpu item in staging cache (this may drop it as well if it was made for only a root asset == no users) - core::unordered_map outputReverseMap; + core::unordered_map outputReverseMap; core::for_each_in_tuple(reservations.m_gpuObjects,[&outputReverseMap](const auto& gpuObjects)->void { uint32_t i = 0; From 7a2065aacd811cb5a2e56e97fbedc4e5fbfeccb9 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 9 Jun 2025 13:48:39 +0700 Subject: [PATCH 278/346] update to latest example --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 6581ed496d..1710b69862 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 6581ed496d2fc41cae1dc5c9ceba10f3bdfc5135 +Subproject commit 1710b698621796aa767edf7bc940e55e6758c2a8 From 5c2f55b34235ceb4e9e62d37522a78ac9e6c74b0 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 9 Jun 2025 14:27:28 +0700 Subject: [PATCH 279/346] Fix pipeline creation in full screen triangle pass --- .../ext/FullScreenTriangle/FullScreenTriangle.h | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h b/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h index 4e7147c904..1abebf23ea 100644 --- a/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h +++ 
b/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h @@ -40,7 +40,7 @@ struct ProtoPipeline final inline operator bool() const {return m_vxShader.get();} inline core::smart_refctd_ptr createPipeline( - const asset::IPipelineBase::SShaderSpecInfo& fragShader, + const video::IGPUPipelineBase::SShaderSpecInfo& fragShader, video::IGPUPipelineLayout* layout, video::IGPURenderpass* renderpass, const uint32_t subpassIx=0, @@ -58,17 +58,13 @@ struct ProtoPipeline final { const auto orientationAsUint32 = static_cast(swapchainTransform); - asset::IPipelineBase::SShaderSpecInfo::spec_constant_map_t specConstants; - specConstants[0] = {.data=&orientationAsUint32,.size=sizeof(orientationAsUint32)}; - - const asset::IPipelineBase::SShaderSpecInfo shaders[2] = { - {.shader=m_vxShader.get(), .entryPoint = "main" ,.stage = hlsl::ESS_VERTEX,.entries=&specConstants}, - fragShader - }; + IGPUPipelineBase::SShaderEntryMap specConstants; + specConstants[0] = std::span{ reinterpret_cast(&orientationAsUint32), sizeof(orientationAsUint32)}; IGPUGraphicsPipeline::SCreationParams params[1]; params[0].layout = layout; - params[0].shaders = shaders; + params[0].vertexShader = { .shader = m_vxShader.get(), .entryPoint = "main", .entries = &specConstants }; + params[0].fragmentShader = fragShader; params[0].cached = { .vertexInput = {}, // The Full Screen Triangle doesn't use any HW vertex input state .primitiveAssembly = {}, From 03f7bc7548fb97f5a7dd9c950997f14fb13521e1 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 9 Jun 2025 14:30:31 +0700 Subject: [PATCH 280/346] Fix descriptor set casting for const counterpart --- include/nbl/asset/ICPUDescriptorSet.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/include/nbl/asset/ICPUDescriptorSet.h b/include/nbl/asset/ICPUDescriptorSet.h index 2498a438ca..c7f54360ac 100644 --- a/include/nbl/asset/ICPUDescriptorSet.h +++ b/include/nbl/asset/ICPUDescriptorSet.h @@ -92,6 +92,13 @@ class NBL_API2 ICPUDescriptorSet 
final : public IDescriptorSet, ICPUDescriptorSet>) static auto computeDependantsImpl(Self* self) { using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; + + using cpu_buffer_ptr_t = std::conditional_t, const ICPUBuffer*, ICPUBuffer*>; + using cpu_sampler_ptr_t = std::conditional_t, const ICPUSampler*, ICPUSampler*>; + using cpu_image_view_ptr_t = std::conditional_t, const ICPUImageView*, ICPUImageView*>; + using cpu_buffer_view_ptr_t = std::conditional_t, const ICPUBufferView*, ICPUBufferView*>; + using cpu_tlas_ptr_t = std::conditional_t, const ICPUTopLevelAccelerationStructure*, ICPUTopLevelAccelerationStructure*>; + core::unordered_set dependants = { self->m_layout.get() }; for (auto i = 0u; i < static_cast(IDescriptor::E_TYPE::ET_COUNT); i++) { @@ -104,15 +111,15 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSet(i))) { case IDescriptor::EC_BUFFER: - dependants.insert(static_cast(desc)); + dependants.insert(static_cast(desc)); case IDescriptor::EC_SAMPLER: - dependants.insert(static_cast(desc)); + dependants.insert(static_cast(desc)); case IDescriptor::EC_IMAGE: - dependants.insert(static_cast(desc)); + dependants.insert(static_cast(desc)); case IDescriptor::EC_BUFFER_VIEW: - dependants.insert(static_cast(desc)); + dependants.insert(static_cast(desc)); case IDescriptor::EC_ACCELERATION_STRUCTURE: - dependants.insert(static_cast(desc)); + dependants.insert(static_cast(desc)); default: break; } From aeebe3679492232275ee7dd28255a336d5af0006 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 9 Jun 2025 14:31:27 +0700 Subject: [PATCH 281/346] Fix entries traversal in gpu pipeline --- include/nbl/video/IGPUPipeline.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/include/nbl/video/IGPUPipeline.h b/include/nbl/video/IGPUPipeline.h index 0b56b87ee9..5a160fb2b2 100644 --- a/include/nbl/video/IGPUPipeline.h +++ b/include/nbl/video/IGPUPipeline.h @@ -61,11 +61,14 @@ class IGPUPipelineBase { // 
https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-module-08987 int64_t specData = 0; - for (const auto& entry : *entries) + if (entries) { - if (!entry.second.size()) - return INVALID_SPEC_INFO; - specData += entry.second.size(); + for (const auto& entry : *entries) + { + if (!entry.second.size()) + return INVALID_SPEC_INFO; + specData += entry.second.size(); + } } if (specData>0x7fffffff) return INVALID_SPEC_INFO; From b65f14fad0ab3405d43fa5b8da313e50a5cb807e Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 9 Jun 2025 14:31:44 +0700 Subject: [PATCH 282/346] move SHitGroup to outside SCreationParams --- include/nbl/video/IGPURayTracingPipeline.h | 13 +++++++------ src/nbl/video/ILogicalDevice.cpp | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/nbl/video/IGPURayTracingPipeline.h b/include/nbl/video/IGPURayTracingPipeline.h index 3bcd4537f3..4b92db329b 100644 --- a/include/nbl/video/IGPURayTracingPipeline.h +++ b/include/nbl/video/IGPURayTracingPipeline.h @@ -15,18 +15,19 @@ class IGPURayTracingPipeline : public IGPUPipeline; public: + struct SHitGroup + { + SShaderSpecInfo closestHit; + SShaderSpecInfo anyHit; + SShaderSpecInfo intersection; + }; + struct SCreationParams : public SPipelineCreationParams { using FLAGS = pipeline_t::FLAGS; struct SShaderGroupsParams { - struct SHitGroup - { - SShaderSpecInfo closestHit; - SShaderSpecInfo anyHit; - SShaderSpecInfo intersection; - }; SShaderSpecInfo raygen; std::span misses; diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 0056cc3a2a..cbfee667cf 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -1094,7 +1094,7 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline core::vector debloatedMissSpecs(missGroupCount); auto debloatedMissSpecData = debloatedMissSpecs.data(); - core::vector 
debloatedHitSpecs(hitGroupCount); + core::vector debloatedHitSpecs(hitGroupCount); auto debloatedHitSpecData = debloatedHitSpecs.data(); core::vector debloatedCallableSpecs(callableGroupCount); auto debloatedCallableSpecData = debloatedCallableSpecs.data(); From c5f947947b1d0c07f244fe8ab5f941e176b548bf Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 9 Jun 2025 14:32:33 +0700 Subject: [PATCH 283/346] Fix ray tracing pipeline creation --- src/nbl/video/CVulkanLogicalDevice.cpp | 33 ++++++++++++++++++++------ 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 24f5ae60b2..89f7ab1da3 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -1473,7 +1473,7 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( ) { using SShaderGroupParams = IGPURayTracingPipeline::SCreationParams::SShaderGroupsParams; - using SHitShaderGroup = SShaderGroupParams::SHitGroup; + using SHitShaderGroup = IGPURayTracingPipeline::SHitGroup; const auto dynamicStates = std::array{ VK_DYNAMIC_STATE_RAY_TRACING_PIPELINE_STACK_SIZE_KHR }; const VkPipelineDynamicStateCreateInfo vk_dynamicStateCreateInfo = { @@ -1518,7 +1518,10 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( { core::unordered_map shaderIndexes; auto getVkShaderIndex = [&](const asset::IShader* shader) - { return shader == nullptr ? VK_SHADER_UNUSED_KHR : shaderIndexes[shader]; }; + { + const auto index = shader == nullptr ? 
VK_SHADER_UNUSED_KHR : shaderIndexes[shader]; + return index; + }; auto getGeneralVkRayTracingShaderGroupCreateInfo = [getVkShaderIndex](IGPUPipelineBase::SShaderSpecInfo spec) -> VkRayTracingShaderGroupCreateInfoKHR { @@ -1553,23 +1556,39 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( if (!spec.shader) return; if (shaderIndexes.find(spec.shader) == shaderIndexes.end()) { - shaderIndexes.insert({ spec.shader, static_cast(std::distance(outShaderStage, vk_shaderStage.data()))}); - *(outShaderStage++) = getVkShaderStageCreateInfoFrom(spec, shaderStage, false, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo,outSpecMapEntry,outSpecData); + shaderIndexes.insert({ spec.shader, std::distancepStages)>(outCreateInfo->pStages, outShaderStage)}); + *(outShaderStage) = getVkShaderStageCreateInfoFrom(spec, shaderStage, false, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo,outSpecMapEntry,outSpecData); + outShaderStage++; } }; - processSpecInfo(info.shaderGroups.raygen, hlsl::ESS_RAYGEN); - outCreateInfo->stageCount = std::distancepStages)>(outCreateInfo->pStages,outShaderStage); - assert(outCreateInfo->stageCount != 0); const auto& shaderGroups = info.shaderGroups; outCreateInfo->pGroups = outShaderGroup; + processSpecInfo(info.shaderGroups.raygen, hlsl::ESS_RAYGEN); *(outShaderGroup++) = getGeneralVkRayTracingShaderGroupCreateInfo(shaderGroups.raygen); + for (const auto& shaderGroup : shaderGroups.misses) + { + processSpecInfo(shaderGroup, hlsl::ESS_MISS); *(outShaderGroup++) = getGeneralVkRayTracingShaderGroupCreateInfo(shaderGroup); + } + for (const auto& shaderGroup : shaderGroups.hits) + { + processSpecInfo(shaderGroup.closestHit, hlsl::ESS_CLOSEST_HIT); + processSpecInfo(shaderGroup.anyHit, hlsl::ESS_ANY_HIT); + processSpecInfo(shaderGroup.intersection, hlsl::ESS_INTERSECTION); *(outShaderGroup++) = getHitVkRayTracingShaderGroupCreateInfo(shaderGroup); + } + for (const auto& shaderGroup : shaderGroups.callables) + 
{ + processSpecInfo(shaderGroup, hlsl::ESS_CALLABLE); *(outShaderGroup++) = getGeneralVkRayTracingShaderGroupCreateInfo(shaderGroup); + } + + outCreateInfo->stageCount = std::distancepStages)>(outCreateInfo->pStages,outShaderStage); + assert(outCreateInfo->stageCount != 0); outCreateInfo->groupCount = 1 + shaderGroups.hits.size() + shaderGroups.misses.size() + shaderGroups.callables.size(); outCreateInfo->maxPipelineRayRecursionDepth = info.cached.maxRecursionDepth; if (info.cached.dynamicStackSize) From edefa6724f4ffa6d6c2adf9c968c8d51691358b2 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 9 Jun 2025 14:32:51 +0700 Subject: [PATCH 284/346] Fix imgui pass --- src/nbl/ext/ImGui/ImGui.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/nbl/ext/ImGui/ImGui.cpp b/src/nbl/ext/ImGui/ImGui.cpp index b40c7155be..f477e96cdf 100644 --- a/src/nbl/ext/ImGui/ImGui.cpp +++ b/src/nbl/ext/ImGui/ImGui.cpp @@ -342,17 +342,13 @@ core::smart_refctd_ptr UI::createPipeline(SCreation core::smart_refctd_ptr pipeline; { - const IPipelineBase::SShaderSpecInfo specs[] = - { - {.shader = shaders.vertex.get(), .entryPoint = "VSMain", .stage = hlsl::ShaderStage::ESS_VERTEX}, - {.shader = shaders.fragment.get(), .entryPoint = "PSMain", .stage = hlsl::ShaderStage::ESS_FRAGMENT} - }; IGPUGraphicsPipeline::SCreationParams params[1]; { auto& param = params[0u]; + param.vertexShader = { .shader = shaders.vertex.get(), .entryPoint = "VSMain" }; + param.fragmentShader = { .shader = shaders.fragment.get(), .entryPoint = "PSMain" }; param.layout = pipelineLayout.get(); - param.shaders = specs; param.renderpass = creationParams.renderpass.get(); param.cached = { .vertexInput = vertexInputParams, .primitiveAssembly = primitiveAssemblyParams, .rasterization = rasterizationParams, .blend = blendParams, .subpassIx = creationParams.subpassIx }; }; From 67c4d8dac7c82c45d2a8575361ec21e6708f2d3e Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 9 Jun 2025 14:33:09 +0700 Subject: 
[PATCH 285/346] Add assert shader stage --- src/nbl/video/utilities/CAssetConverter.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 41f57e3b11..147a76bdd4 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -1040,6 +1040,7 @@ class HashVisit : public CAssetConverter::CHashCache::hash_impl_base { const auto stage = std::get<1>(argTuple); hasher << arg0.entryPoint; + assert(hlsl::bitCount(stage) == 1); hasher << stage; hasher << arg0.requiredSubgroupSize; if (!arg0.entries.empty()) From bc9aed64c6c32414d86101165bf828f39b00a5ed Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 9 Jun 2025 14:36:14 +0700 Subject: [PATCH 286/346] use core::bitflag::hasFlags instead of & --- include/nbl/video/IGPURayTracingPipeline.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/video/IGPURayTracingPipeline.h b/include/nbl/video/IGPURayTracingPipeline.h index 4b92db329b..7151f8f227 100644 --- a/include/nbl/video/IGPURayTracingPipeline.h +++ b/include/nbl/video/IGPURayTracingPipeline.h @@ -109,7 +109,7 @@ class IGPURayTracingPipeline : public IGPUPipeline Date: Mon, 9 Jun 2025 14:39:51 +0700 Subject: [PATCH 287/346] Add whether shader is null when cloning SShaderSpecInfo --- include/nbl/asset/ICPUPipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index 0642acb676..8b9fec34c4 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -85,7 +85,7 @@ class ICPUPipelineBase SShaderSpecInfo clone(uint32_t depth) const { auto newSpecInfo = *this; - if (depth > 0u) + if (newSpecInfo.shader.get() != nullptr && depth > 0u) { newSpecInfo.shader = core::smart_refctd_ptr_static_cast(this->shader->clone(depth - 1u)); } From 35815d2a8d25b3d2a5e3281f6b2a411d8b8c31f6 Mon Sep 17 00:00:00 2001 
From: kevyuu Date: Mon, 9 Jun 2025 15:23:18 +0700 Subject: [PATCH 288/346] Small improvement on ILogicalDevice --- src/nbl/video/ILogicalDevice.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index cbfee667cf..975151ddbd 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -29,7 +29,9 @@ class SpirvDebloatTask IGPUPipelineBase::SShaderSpecInfo debloat(const IGPUPipelineBase::SShaderSpecInfo& shaderSpec, core::vector>& outShaders) { const auto* shader = shaderSpec.shader; - const auto& entryPoints = m_entryPointsMap[shader]; + const auto findResult = m_entryPointsMap.find(shader); + assert(findResult != m_entryPointsMap.end()); + const auto& entryPoints = findResult->second; auto debloatedShaderSpec = shaderSpec; if (shader != nullptr) @@ -1128,14 +1130,14 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline newParams[ix] = param; newParams[ix].shaderGroups.raygen = debloatTask.debloat(param.shaderGroups.raygen, debloatedShaders); - newParams[ix].shaderGroups.misses = { debloatedMissSpecData, param.shaderGroups.misses.size() }; + newParams[ix].shaderGroups.misses = debloatedMissSpecs; for (const auto& miss: param.shaderGroups.misses) { *debloatedMissSpecData = debloatTask.debloat(miss, debloatedShaders); debloatedMissSpecData++; } - newParams[ix].shaderGroups.hits = { debloatedHitSpecData, param.shaderGroups.hits.size() }; + newParams[ix].shaderGroups.hits = debloatedHitSpecs; for (const auto& hit: param.shaderGroups.hits) { *debloatedHitSpecData = { @@ -1146,7 +1148,7 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline debloatedHitSpecData++; } - newParams[ix].shaderGroups.callables = { debloatedCallableSpecData, param.shaderGroups.callables.size() }; + newParams[ix].shaderGroups.callables = debloatedCallableSpecs; for (const auto& callable: 
param.shaderGroups.callables) { *debloatedCallableSpecData = debloatTask.debloat(callable, debloatedShaders); From 66c87a0040ca652ea0bf3cce7825c08b2cc2f0b0 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 9 Jun 2025 15:24:11 +0700 Subject: [PATCH 289/346] Small improvement on SShaderSpecInfo::create --- include/nbl/video/IGPUPipeline.h | 7 ++++--- src/nbl/video/utilities/CAssetConverter.cpp | 12 ++++++------ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/include/nbl/video/IGPUPipeline.h b/include/nbl/video/IGPUPipeline.h index 5a160fb2b2..96ee843296 100644 --- a/include/nbl/video/IGPUPipeline.h +++ b/include/nbl/video/IGPUPipeline.h @@ -109,17 +109,18 @@ class IGPUPipelineBase { // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02754 - static inline SShaderSpecInfo create(const asset::ICPUPipelineBase::SShaderSpecInfo& cpuSpecInfo, entry_map_t& outEntries) + static inline SShaderSpecInfo create(const asset::ICPUPipelineBase::SShaderSpecInfo& cpuSpecInfo, entry_map_t* outEntries) { SShaderSpecInfo specInfo; specInfo.shader = cpuSpecInfo.shader.get(); specInfo.entryPoint = cpuSpecInfo.entryPoint; specInfo.requiredSubgroupSize = cpuSpecInfo.requiredSubgroupSize; + outEntries->clear(); for (const auto&[key, value] : cpuSpecInfo.entries) { - outEntries.insert({ key, { value.data(), value.size() } }); + outEntries->insert({ key, { value.data(), value.size() } }); } - specInfo.entries = &outEntries; + specInfo.entries = outEntries; return specInfo; }; }; diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 147a76bdd4..b53dc54262 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -3203,7 +3203,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult IGPUComputePipeline::SCreationParams params = {}; params.layout = 
visitor.layout; // while there are patches possible for shaders, the only patch which can happen here is changing a stage from UNKNOWN to COMPUTE - params.shader = IGPUPipelineBase::SShaderSpecInfo::create(visitor.getSpecInfo(), entryMap); + params.shader = IGPUPipelineBase::SShaderSpecInfo::create(visitor.getSpecInfo(), &entryMap); device->createComputePipelines(inputs.pipelineCache,{¶ms,1},&ppln); } conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,std::move(ppln)); @@ -3268,11 +3268,11 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult tmpSpecInfo.push_back(std::move(info)); } using GPUShaderSpecInfo = IGPUPipelineBase::SShaderSpecInfo; - params.vertexShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_VERTEX), vertexEntryMap); - params.tesselationControlShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_TESSELLATION_CONTROL), tesselationControlEntryMap); - params.tesselationEvaluationShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_TESSELLATION_EVALUATION), tesselationEvaluationEntryMap); - params.geometryShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_GEOMETRY), geometryEntryMap); - params.fragmentShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_FRAGMENT), fragmentEntryMap); + params.vertexShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_VERTEX), &vertexEntryMap); + params.tesselationControlShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_TESSELLATION_CONTROL), &tesselationControlEntryMap); + params.tesselationEvaluationShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_TESSELLATION_EVALUATION), &tesselationEvaluationEntryMap); + params.geometryShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_GEOMETRY), &geometryEntryMap); + params.fragmentShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_FRAGMENT), &fragmentEntryMap); } params.cached = asset->getCachedCreationParams(); 
device->createGraphicsPipelines(inputs.pipelineCache,{¶ms,1},&ppln); From 01dced9a007c5e4592aa65d77af9c6c14aab5b72 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 9 Jun 2025 15:24:36 +0700 Subject: [PATCH 290/346] Skip null node --- include/nbl/asset/IPreHashed.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/nbl/asset/IPreHashed.h b/include/nbl/asset/IPreHashed.h index 86e1841f61..94fb9a7d2d 100644 --- a/include/nbl/asset/IPreHashed.h +++ b/include/nbl/asset/IPreHashed.h @@ -46,8 +46,6 @@ class IPreHashed : public IAsset core::unordered_set alreadyDescended; // whether we have push the children to the stack auto push = [&stack,&alreadyVisited](IAsset* node) -> void { - if (!node) - return; const auto [dummy,inserted] = alreadyVisited.insert(node); if (inserted) stack.push(node); From ce77b462813cb4bb18ef26d6c02027514536e55a Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 9 Jun 2025 16:52:17 +0700 Subject: [PATCH 291/346] uncomment some concept requires --- include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 0177863b11..e11e238130 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -91,7 +91,7 @@ struct ArithmeticConfiguration // get a coalesced index to store for the next level in shared mem, e.g. 
level 0 -> level 1 // specify the next level to store values for in template param // at level==LevelCount-1, it is guaranteed to have SubgroupSize elements - template// NBL_FUNC_REQUIRES(level>0 && level0 && level// NBL_FUNC_REQUIRES(level>0 && level0 && level// NBL_FUNC_REQUIRES(level>0 && level0 && level Date: Wed, 11 Jun 2025 14:06:36 +0700 Subject: [PATCH 292/346] Add renderpass to constructor parameter of cpu graphics pipeline --- include/nbl/asset/ICPUGraphicsPipeline.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index a17bebe87d..8e338020ab 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -20,9 +20,9 @@ class ICPUGraphicsPipeline final : public ICPUPipeline create(ICPUPipelineLayout* layout) + static core::smart_refctd_ptr create(ICPUPipelineLayout* layout, ICPURenderpass* renderpass = nullptr) { - auto retval = new ICPUGraphicsPipeline(layout); + auto retval = new ICPUGraphicsPipeline(layout, renderpass); return core::smart_refctd_ptr(retval,core::dont_grab); } @@ -79,8 +79,8 @@ class ICPUGraphicsPipeline final : public ICPUPipeline m_specInfos; private: - explicit ICPUGraphicsPipeline(ICPUPipelineLayout* layout) - : base_t(layout, {}, {}) + explicit ICPUGraphicsPipeline(ICPUPipelineLayout* layout, ICPURenderpass* renderpass) + : base_t(layout, {}, renderpass) {} static inline int8_t stageToIndex(const hlsl::ShaderStage stage) @@ -110,9 +110,8 @@ class ICPUGraphicsPipeline final : public ICPUPipeline clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final { - auto* newPipeline = new ICPUGraphicsPipeline(layout.get()); + auto* newPipeline = new ICPUGraphicsPipeline(layout.get(), m_renderpass.get()); newPipeline->m_params = m_params; - newPipeline->m_renderpass = m_renderpass; for (auto specInfo_i = 0u; specInfo_i < m_specInfos.size(); specInfo_i++) { From 
ccecd470a645b56f4b7296bc4dd1115f259c5c87 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 11 Jun 2025 14:07:11 +0700 Subject: [PATCH 293/346] Fix overload error --- include/nbl/asset/ICPUComputePipeline.h | 17 ++++++++++++- include/nbl/asset/ICPUGraphicsPipeline.h | 28 ++++++++++++++++++++- include/nbl/asset/ICPUPipeline.h | 6 +++-- include/nbl/asset/ICPURayTracingPipeline.h | 17 ++++++++++++- src/nbl/asset/utils/CSPIRVIntrospector.cpp | 2 +- src/nbl/video/utilities/CAssetConverter.cpp | 15 +++-------- src/nbl/video/utilities/CComputeBlit.cpp | 4 +-- 7 files changed, 69 insertions(+), 20 deletions(-) diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 69bffe2bba..cc05e6c762 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -15,6 +15,7 @@ namespace nbl::asset //! CPU Version of Compute Pipeline class ICPUComputePipeline final : public ICPUPipeline> { + using pipeline_base_t = IComputePipeline; using base_t = ICPUPipeline>; public: @@ -46,6 +47,11 @@ class ICPUComputePipeline final : public ICPUPipeline getSpecInfos(hlsl::ShaderStage stage) + { + return base_t::getSpecInfos(stage); + } + inline SShaderSpecInfo& getSpecInfo() { return m_specInfo; @@ -56,7 +62,16 @@ class ICPUComputePipeline final : public ICPUPipeline getSpecInfos(hlsl::ShaderStage stage) + { + return base_t::getSpecInfos(stage); + } + + SShaderSpecInfo* getSpecInfo(hlsl::ShaderStage stage) + { + if (!isMutable()) return nullptr; + const auto stageIndex = stageToIndex(stage); + if (stageIndex != -1) + return &m_specInfos[stageIndex]; + return nullptr; + } + + const SShaderSpecInfo* getSpecInfo(hlsl::ShaderStage stage) const + { + const auto stageIndex = stageToIndex(stage); + if (stageIndex != -1) + return &m_specInfos[stageIndex]; + return nullptr; + } inline virtual bool valid() const override final { diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index 
8b9fec34c4..7003beeee7 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -132,10 +132,12 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase, public ICPUPipe } // Note(kevinyu): For some reason overload resolution cannot find this function when I name id getSpecInfos. It always use the const variant. Will check on it later. - inline std::span getSpecInfoMut(hlsl::ShaderStage stage) + inline std::span getSpecInfos(hlsl::ShaderStage stage) { if (!isMutable()) return {}; - const auto specInfo = const_cast(this)->getSpecInfos(stage); + const this_t* constPipeline = const_cast(this); + const ICPUPipelineBase* basePipeline = constPipeline; + const auto specInfo = basePipeline->getSpecInfos(stage); return { const_cast(specInfo.data()), specInfo.size() }; } diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h index 1296d8359a..0c448b06b1 100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -44,7 +44,7 @@ class ICPURayTracingPipeline final : public ICPUPipeline getSpecInfo(hlsl::ShaderStage stage) const override final + inline virtual std::span getSpecInfos(hlsl::ShaderStage stage) const override final { switch (stage) { @@ -65,6 +65,11 @@ class ICPURayTracingPipeline final : public ICPUPipeline getSpecInfos(hlsl::ShaderStage stage) + { + return base_t::getSpecInfos(stage); + } + inline core::vector* getSpecInfoVec(hlsl::ShaderStage stage) { if (!isMutable()) return nullptr; @@ -95,6 +100,16 @@ class ICPURayTracingPipeline final : public ICPUPipeline CSPIRVIntrospector::createApproximat } auto pipeline = ICPUComputePipeline::create(layout.get()); - pipeline->getSpecInfoMut(hlsl::ShaderStage::ESS_COMPUTE)[0] = info; + pipeline->getSpecInfos(hlsl::ShaderStage::ESS_COMPUTE)[0] = info; return pipeline; } diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 
b53dc54262..d8ce147820 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -536,8 +536,8 @@ class AssetVisitor : public CRTP using stage_t = hlsl::ShaderStage; for (stage_t stage : {stage_t::ESS_VERTEX,stage_t::ESS_TESSELLATION_CONTROL,stage_t::ESS_TESSELLATION_EVALUATION,stage_t::ESS_GEOMETRY,stage_t::ESS_FRAGMENT}) { - const auto& specInfo = asset->getSpecInfos(stage); - const auto* shader = specInfo[0].shader.get(); + const auto& specInfo = *asset->getSpecInfo(stage); + const auto* shader = specInfo.shader.get(); if (!shader) { if (stage==stage_t::ESS_VERTEX) // required @@ -545,7 +545,7 @@ class AssetVisitor : public CRTP CRTP::template nullOptional(); continue; } - if (!descend(shader,{shader},specInfo[0], stage)) + if (!descend(shader,{shader}, specInfo, stage)) return false; } return true; @@ -3226,8 +3226,6 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } if constexpr (std::is_same_v) { - core::vector tmpSpecInfo; - tmpSpecInfo.reserve(5); for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUGraphicsPipeline* asset = entry.second.canonicalAsset; @@ -3259,14 +3257,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult params.layout = visitor.layout; params.renderpass = visitor.renderpass; // while there are patches possible for shaders, the only patch which can happen here is changing a stage from UNKNOWN to match the slot here - tmpSpecInfo.clear(); using stage_t = hlsl::ShaderStage; - for (stage_t stage : {stage_t::ESS_VERTEX,stage_t::ESS_TESSELLATION_CONTROL,stage_t::ESS_TESSELLATION_EVALUATION,stage_t::ESS_GEOMETRY,stage_t::ESS_FRAGMENT}) - { - auto& info = visitor.getSpecInfo(stage); - if (info.shader) - tmpSpecInfo.push_back(std::move(info)); - } using GPUShaderSpecInfo = IGPUPipelineBase::SShaderSpecInfo; params.vertexShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_VERTEX), &vertexEntryMap); 
params.tesselationControlShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_TESSELLATION_CONTROL), &tesselationControlEntryMap); diff --git a/src/nbl/video/utilities/CComputeBlit.cpp b/src/nbl/video/utilities/CComputeBlit.cpp index ade127b790..924c337cbe 100644 --- a/src/nbl/video/utilities/CComputeBlit.cpp +++ b/src/nbl/video/utilities/CComputeBlit.cpp @@ -78,12 +78,12 @@ struct ConstevalParameters } auto pipeline = ICPUComputePipeline::create(layout); - pipeline->getSpecInfoMut(ESS_COMPUTE)[0] = { + pipeline->getSpecInfo() = { .shader = shader, .entryPoint = "main", .requiredSubgroupSize = static_cast(findMSB(limits.maxSubgroupSize)), }; - pipeline->getCachedCreationParamsMut() = { + pipeline->getCachedCreationParams() = { .requireFullSubgroups = true, }; return pipeline; From 99f2d49295810801e5f33d5bebb0a8de2a9e38b9 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 11 Jun 2025 14:07:40 +0700 Subject: [PATCH 294/346] Optimize SpirvDebloaterTask to use only one map --- src/nbl/video/ILogicalDevice.cpp | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 975151ddbd..75b36ce889 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -10,7 +10,12 @@ using namespace nbl::video; class SpirvDebloatTask { public: - using EntryPoints = core::set; + using EntryPoints = core::set; + struct ShaderInfo + { + EntryPoints entryPoints; + const asset::IShader* debloatedShaders; + }; SpirvDebloatTask(asset::ISPIRVDebloater* debloater, system::logger_opt_ptr logger) : m_debloater(debloater), m_logger(logger) { @@ -20,38 +25,37 @@ class SpirvDebloatTask void insertEntryPoint(const IGPUPipelineBase::SShaderSpecInfo& shaderSpec, hlsl::ShaderStage stage) { const auto* shader = shaderSpec.shader; - auto it = m_entryPointsMap.find(shader); - if (it == m_entryPointsMap.end() || it->first != shader) - it = 
m_entryPointsMap.emplace_hint(it, shader, EntryPoints()); - it->second.insert({ .name = shaderSpec.entryPoint, .stage = stage }); + auto it = m_shaderInfoMap.find(shader); + if (it == m_shaderInfoMap.end() || it->first != shader) + it = m_shaderInfoMap.emplace_hint(it, shader, ShaderInfo{ EntryPoints(), nullptr } ); + it->second.entryPoints.insert({ .name = shaderSpec.entryPoint, .stage = stage }); } IGPUPipelineBase::SShaderSpecInfo debloat(const IGPUPipelineBase::SShaderSpecInfo& shaderSpec, core::vector>& outShaders) { const auto* shader = shaderSpec.shader; - const auto findResult = m_entryPointsMap.find(shader); - assert(findResult != m_entryPointsMap.end()); - const auto& entryPoints = findResult->second; + auto findResult = m_shaderInfoMap.find(shader); + assert(findResult != m_shaderInfoMap.end()); + const auto& entryPoints = findResult->second.entryPoints; + auto* debloatedShader = findResult->second.debloatedShaders; auto debloatedShaderSpec = shaderSpec; if (shader != nullptr) { - if (!m_debloatedShadersMap.contains(shader)) + if (debloatedShader == nullptr) { const auto outShadersData = outShaders.data(); outShaders.push_back(m_debloater->debloat(shader, entryPoints, m_logger)); assert(outShadersData == outShaders.data()); - m_debloatedShadersMap.emplace(shader, outShaders.back().get()); + debloatedShader = outShaders.back().get(); } - const auto debloatedShader = m_debloatedShadersMap[shader]; debloatedShaderSpec.shader = debloatedShader; } return debloatedShaderSpec; } private: - core::map m_entryPointsMap; - core::map m_debloatedShadersMap; + core::map m_shaderInfoMap; asset::ISPIRVDebloater* m_debloater; const system::logger_opt_ptr m_logger; }; From efecb7efb59d57a0334bf52913d1f7fe0a73f367 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 11 Jun 2025 14:07:55 +0700 Subject: [PATCH 295/346] Add inline to method in IGPUPipeline --- include/nbl/video/IGPUPipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/include/nbl/video/IGPUPipeline.h b/include/nbl/video/IGPUPipeline.h index 96ee843296..c22ad998db 100644 --- a/include/nbl/video/IGPUPipeline.h +++ b/include/nbl/video/IGPUPipeline.h @@ -75,7 +75,7 @@ class IGPUPipelineBase { return static_cast(specData); } - bool accumulateSpecializationValidationResult(SSpecializationValidationResult* retval) const + inline bool accumulateSpecializationValidationResult(SSpecializationValidationResult* retval) const { const auto dataSize = valid(); if (dataSize < 0) From 1707b84bfd374a7ba43da9e0dc5a5fe8332194e4 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 11 Jun 2025 14:46:03 +0700 Subject: [PATCH 296/346] Move required subgroups size stages checking to commonCreatePipelines --- include/nbl/video/IGPUComputePipeline.h | 9 ++++++++ include/nbl/video/IGPUGraphicsPipeline.h | 17 +++++++++++++++ include/nbl/video/IGPURayTracingPipeline.h | 24 ++++++++++++++++++++++ include/nbl/video/ILogicalDevice.h | 8 ++++++++ src/nbl/video/ILogicalDevice.cpp | 7 ------- 5 files changed, 58 insertions(+), 7 deletions(-) diff --git a/include/nbl/video/IGPUComputePipeline.h b/include/nbl/video/IGPUComputePipeline.h index 36813699c0..4c7bac1e6a 100644 --- a/include/nbl/video/IGPUComputePipeline.h +++ b/include/nbl/video/IGPUComputePipeline.h @@ -62,6 +62,15 @@ class IGPUComputePipeline : public IGPUPipeline getRequiredSubgroupStages() const + { + if (shader.requiredSubgroupSize >= asset::IPipelineBase::SUBGROUP_SIZE::REQUIRE_4) + { + return hlsl::ESS_COMPUTE; + } + return {}; + } + IGPUPipelineLayout* layout = nullptr; // TODO: Could guess the required flags from SPIR-V introspection of declared caps core::bitflag flags = FLAGS::NONE; diff --git a/include/nbl/video/IGPUGraphicsPipeline.h b/include/nbl/video/IGPUGraphicsPipeline.h index 806ee337c3..dd2e587ee4 100644 --- a/include/nbl/video/IGPUGraphicsPipeline.h +++ b/include/nbl/video/IGPUGraphicsPipeline.h @@ -67,6 +67,23 @@ class IGPUGraphicsPipeline : public IGPUPipeline 
getRequiredSubgroupStages() const + { + core::bitflag stages; + auto processSpecInfo = [&](const SShaderSpecInfo& spec, hlsl::ShaderStage stage) + { + if (spec.requiredSubgroupSize >= SUBGROUP_SIZE::REQUIRE_4) { + stages |= stage; + } + }; + processSpecInfo(vertexShader, hlsl::ESS_VERTEX); + processSpecInfo(tesselationControlShader, hlsl::ESS_TESSELLATION_CONTROL); + processSpecInfo(tesselationEvaluationShader, hlsl::ESS_TESSELLATION_EVALUATION); + processSpecInfo(geometryShader, hlsl::ESS_GEOMETRY); + processSpecInfo(fragmentShader, hlsl::ESS_FRAGMENT); + return stages; + } + IGPUPipelineLayout* layout = nullptr; SShaderSpecInfo vertexShader; SShaderSpecInfo tesselationControlShader; diff --git a/include/nbl/video/IGPURayTracingPipeline.h b/include/nbl/video/IGPURayTracingPipeline.h index 7151f8f227..90060ab883 100644 --- a/include/nbl/video/IGPURayTracingPipeline.h +++ b/include/nbl/video/IGPURayTracingPipeline.h @@ -143,6 +143,30 @@ class IGPURayTracingPipeline : public IGPUPipeline getRequiredSubgroupStages() const + { + core::bitflag stages; + auto processSpecInfo = [&](const SShaderSpecInfo& spec, hlsl::ShaderStage stage) + { + if (spec.requiredSubgroupSize >= SUBGROUP_SIZE::REQUIRE_4) { + stages |= stage; + } + }; + processSpecInfo(shaderGroups.raygen, hlsl::ESS_RAYGEN); + for (const auto& miss : shaderGroups.misses) + processSpecInfo(miss, hlsl::ESS_MISS); + for (const auto& hit : shaderGroups.hits) + { + processSpecInfo(hit.closestHit, hlsl::ESS_CLOSEST_HIT); + processSpecInfo(hit.anyHit, hlsl::ESS_ANY_HIT); + processSpecInfo(hit.intersection, hlsl::ESS_INTERSECTION); + } + for (const auto& callable : shaderGroups.callables) + processSpecInfo(callable, hlsl::ESS_CALLABLE); + return stages; + } + }; struct SShaderGroupHandle diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 5976d06eb0..3f4dfa0f05 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -1259,6 +1259,14 @@ class 
NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe return {}; } + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02755 + const auto requiredSubgroupSizeStages = getPhysicalDeviceLimits().requiredSubgroupSizeStages; + if (!requiredSubgroupSizeStages.hasFlags(ci.getRequiredSubgroupStages())) + { + NBL_LOG_ERROR("Invalid shader stage"); + return {}; + } + retval += validation; } return retval; diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 75b36ce889..52ca3a55bd 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -812,13 +812,6 @@ bool ILogicalDevice::createComputePipelines(IGPUPipelineCache* const pipelineCac { const auto& ci = params[ix]; - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02755 - if (ci.shader.requiredSubgroupSize>=asset::IPipelineBase::SUBGROUP_SIZE::REQUIRE_4 && !getPhysicalDeviceLimits().requiredSubgroupSizeStages.hasFlags(hlsl::ShaderStage::ESS_COMPUTE)) - { - NBL_LOG_ERROR("Invalid shader stage"); - return false; - } - const core::set entryPoints = { asset::ISPIRVDebloater::EntryPoint{.name = ci.shader.entryPoint, .stage = hlsl::ShaderStage::ESS_COMPUTE} }; debloatedShaders.push_back(m_spirvDebloater->debloat(ci.shader.shader, entryPoints, m_logger)); auto debloatedShaderSpec = ci.shader; From 165eebc56ca14b4afcac3b9180d67112919038fa Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 11 Jun 2025 20:34:41 +0700 Subject: [PATCH 297/346] Implement visitDependents --- include/nbl/asset/IAsset.h | 21 ++++++++++++++ include/nbl/asset/ICPUAccelerationStructure.h | 9 ++++++ include/nbl/asset/ICPUAnimationLibrary.h | 7 +++++ include/nbl/asset/ICPUBuffer.h | 2 ++ include/nbl/asset/ICPUBufferView.h | 6 ++++ include/nbl/asset/ICPUComputePipeline.h | 6 ++++ 
include/nbl/asset/ICPUDescriptorSet.h | 29 +++++++++++++++++++ include/nbl/asset/ICPUDescriptorSetLayout.h | 6 ++++ include/nbl/asset/ICPUGraphicsPipeline.h | 8 +++++ include/nbl/asset/ICPUImage.h | 4 +++ include/nbl/asset/ICPUImageView.h | 5 ++++ include/nbl/asset/ICPUMesh.h | 4 +++ include/nbl/asset/ICPUMeshBuffer.h | 3 ++ include/nbl/asset/ICPUPipelineCache.h | 8 +++-- include/nbl/asset/ICPUPipelineLayout.h | 9 ++++++ include/nbl/asset/ICPURayTracingPipeline.h | 17 +++++++++++ include/nbl/asset/ICPURenderpass.h | 6 ++++ .../asset/ICPURenderpassIndependentPipeline.h | 6 ++++ include/nbl/asset/ICPUSampler.h | 6 ++++ include/nbl/asset/ICPUSkeleton.h | 6 ++++ include/nbl/asset/IShader.h | 5 ++++ 21 files changed, 171 insertions(+), 2 deletions(-) diff --git a/include/nbl/asset/IAsset.h b/include/nbl/asset/IAsset.h index 0e91b99c36..cc105f2633 100644 --- a/include/nbl/asset/IAsset.h +++ b/include/nbl/asset/IAsset.h @@ -156,6 +156,25 @@ class IAsset : virtual public core::IReferenceCounted //! 
inline bool isMutable() const {return m_mutable;} + inline void visitDependents(std::function visit) const + { + visitDependentsImpl([&visit](const IAsset* dep)->bool + { + if (dep) + return visit(dep); + return true; + }); + } + + inline void visitDependents(std::function visit) + { + assert(isMutable()); + visitDependents([&](const IAsset* dependent) -> bool + { + return visit(const_cast(dependent)); + }); + } + virtual core::unordered_set computeDependants() const = 0; virtual core::unordered_set computeDependants() = 0; @@ -174,6 +193,8 @@ class IAsset : virtual public core::IReferenceCounted private: friend IAssetManager; bool m_mutable = true; + + virtual void visitDependentsImpl(std::function visit) const = 0; }; template diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index 4e194867e6..8d02b3ac8b 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -135,6 +135,7 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo return cp; } + // Do not report anything as a dependant, we'll simply drop the data instead of discarding its contents inline core::unordered_set computeDependants() const override { @@ -257,6 +258,8 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo core::smart_refctd_dynamic_array> m_AABBGeoms = nullptr; core::smart_refctd_dynamic_array m_geometryPrimitiveCount = nullptr; core::bitflag m_buildFlags = BUILD_FLAGS::PREFER_FAST_TRACE_BIT; + + inline virtual void visitDependentsImpl(std::function visit) const override {} }; class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelAccelerationStructure @@ -386,6 +389,12 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA for (const auto& instance : *self->m_instances) dependants.insert(instance.getBase().blas.get()); return dependants; + } + + inline virtual void 
visitDependentsImpl(std::function visit) const override + { + for (const auto& instance : *m_instances) + if (!visit(instance.getBase().blas.get())) return; } }; diff --git a/include/nbl/asset/ICPUAnimationLibrary.h b/include/nbl/asset/ICPUAnimationLibrary.h index 1663447b73..33b5b182c9 100644 --- a/include/nbl/asset/ICPUAnimationLibrary.h +++ b/include/nbl/asset/ICPUAnimationLibrary.h @@ -113,6 +113,13 @@ class ICPUAnimationLibrary final : public IAnimationLibrary, public static auto computeDependantsImpl(Self* self) { using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; return core::unordered_set{ self->m_keyframeStorageBinding.buffer.get(), self->m_timestampStorageBinding.buffer.get(), self->m_animationStorageRange.buffer.get() }; + } + + virtual void visitDependentsImpl(std::function visit) const override + { + if (!visit(m_keyframeStorageBinding.buffer.get())) return; + if (!visit(m_timestampStorageBinding.buffer.get())) return; + if (!visit(m_animationStorageRange.buffer.get())) return; } }; diff --git a/include/nbl/asset/ICPUBuffer.h b/include/nbl/asset/ICPUBuffer.h index 0ad1d7bf48..94f1dc750a 100644 --- a/include/nbl/asset/ICPUBuffer.h +++ b/include/nbl/asset/ICPUBuffer.h @@ -139,6 +139,8 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed discardContent_impl(); } + inline virtual void visitDependentsImpl(std::function visit) const override {} + void* m_data; core::smart_refctd_ptr m_mem_resource; size_t m_alignment; diff --git a/include/nbl/asset/ICPUBufferView.h b/include/nbl/asset/ICPUBufferView.h index 55d50356c1..1741a1f445 100644 --- a/include/nbl/asset/ICPUBufferView.h +++ b/include/nbl/asset/ICPUBufferView.h @@ -28,6 +28,7 @@ class ICPUBufferView : public IBufferView, public IAsset constexpr static inline auto AssetType = ET_BUFFER_VIEW; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } + inline core::unordered_set computeDependants() const override { return computeDependantsImpl(this); @@ -66,6 
+67,11 @@ class ICPUBufferView : public IBufferView, public IAsset using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; return core::unordered_set{ self->m_buffer.get() }; } + + inline virtual void visitDependentsImpl(std::function visit) const override + { + if (!visit(m_buffer.get())) return; + } }; } diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index cc05e6c762..5dbec00ea4 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -105,6 +105,12 @@ class ICPUComputePipeline final : public ICPUPipeline, const IAsset*, IAsset*>; return core::unordered_set{ self->m_layout.get(), self->m_specInfo.shader.get() }; } + + virtual void visitDependentsImpl(std::function visit) const override + { + if (!visit(m_layout.get())) return; + if (!visit(m_specInfo.shader.get())) return; + } }; } diff --git a/include/nbl/asset/ICPUDescriptorSet.h b/include/nbl/asset/ICPUDescriptorSet.h index c7f54360ac..05a7f51f60 100644 --- a/include/nbl/asset/ICPUDescriptorSet.h +++ b/include/nbl/asset/ICPUDescriptorSet.h @@ -127,6 +127,35 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSet visit) const override + { + for (auto i = 0u; i < static_cast(IDescriptor::E_TYPE::ET_COUNT); i++) + { + if (!m_descriptorInfos[i]) continue; + const auto size = m_descriptorInfos[i]->size(); + for (auto desc_i = 0u; desc_i < size; desc_i++) + { + auto* desc = m_descriptorInfos[i]->operator[](desc_i).desc.get(); + if (!desc) continue; + switch (IDescriptor::GetTypeCategory(static_cast(i))) + { + case IDescriptor::EC_BUFFER: + if (!visit(static_cast(desc))) return; + case IDescriptor::EC_SAMPLER: + if (!visit(static_cast(desc))) return; + case IDescriptor::EC_IMAGE: + if (!visit(static_cast(desc))) return; + case IDescriptor::EC_BUFFER_VIEW: + if (!visit(static_cast(desc))) return; + case IDescriptor::EC_ACCELERATION_STRUCTURE: + if (!visit(static_cast(desc))) return; + default: + break; + } + 
} + } + } }; } diff --git a/include/nbl/asset/ICPUDescriptorSetLayout.h b/include/nbl/asset/ICPUDescriptorSetLayout.h index aea1520b6f..8dce4d9db4 100644 --- a/include/nbl/asset/ICPUDescriptorSetLayout.h +++ b/include/nbl/asset/ICPUDescriptorSetLayout.h @@ -85,6 +85,12 @@ class ICPUDescriptorSetLayout : public IDescriptorSetLayout, public return dependants; } + inline virtual void visitDependentsImpl(std::function visit) const override + { + if (m_immutableSamplers) return; + for (const auto& sampler : *m_immutableSamplers) + if (!visit(sampler.get())) return; + } }; } diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index eb4bc0d961..470c5d813b 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -146,6 +146,14 @@ class ICPUGraphicsPipeline final : public ICPUPipeline(newPipeline, core::dont_grab); } + + inline virtual void visitDependentsImpl(std::function visit) const override + { + if (!visit(m_layout.get())) return; + if (!visit(m_renderpass.get())) return; + for (const auto& info : m_specInfos) + if (!visit(info.shader.get())) return; + } }; } diff --git a/include/nbl/asset/ICPUImage.h b/include/nbl/asset/ICPUImage.h index b732e50492..f13d75b76a 100644 --- a/include/nbl/asset/ICPUImage.h +++ b/include/nbl/asset/ICPUImage.h @@ -227,6 +227,10 @@ class NBL_API2 ICPUImage final : public IImage, public IPreHashed return _a.imageSubresource.mipLevel < _b.imageSubresource.mipLevel; } }; + + inline virtual void visitDependentsImpl(std::function visit) const override + { + } }; } // end namespace nbl::asset diff --git a/include/nbl/asset/ICPUImageView.h b/include/nbl/asset/ICPUImageView.h index 9639df6eb9..74cb143fe6 100644 --- a/include/nbl/asset/ICPUImageView.h +++ b/include/nbl/asset/ICPUImageView.h @@ -82,6 +82,11 @@ class ICPUImageView final : public IImageView, public IAsset using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; return 
core::unordered_set{ self->params.image.get() }; } + + inline virtual void visitDependentsImpl(std::function visit) const override + { + if (!visit(params.image.get())) return; + } }; } diff --git a/include/nbl/asset/ICPUMesh.h b/include/nbl/asset/ICPUMesh.h index e9aaf53ba4..f52db5055e 100644 --- a/include/nbl/asset/ICPUMesh.h +++ b/include/nbl/asset/ICPUMesh.h @@ -96,6 +96,10 @@ class ICPUMesh final : public IMesh, public IAsset private: core::vector> m_meshBuffers; + + inline virtual void visitDependentsImpl(std::function visit) const override + { + } }; } diff --git a/include/nbl/asset/ICPUMeshBuffer.h b/include/nbl/asset/ICPUMeshBuffer.h index c44d055c18..9872cc6b10 100644 --- a/include/nbl/asset/ICPUMeshBuffer.h +++ b/include/nbl/asset/ICPUMeshBuffer.h @@ -622,6 +622,9 @@ class ICPUMeshBuffer final : public IMeshBuffer visit) const override + { + } }; } diff --git a/include/nbl/asset/ICPUPipelineCache.h b/include/nbl/asset/ICPUPipelineCache.h index 0ff912603d..85ac650a22 100644 --- a/include/nbl/asset/ICPUPipelineCache.h +++ b/include/nbl/asset/ICPUPipelineCache.h @@ -60,12 +60,12 @@ class ICPUPipelineCache final : public IPreHashed return core::make_smart_refctd_ptr(std::move(cache_cp)); } - inline core::unordered_set computeDependants() const override + inline core::unordered_set computeDependants() const override { return {}; } - inline core::unordered_set computeDependants() override + inline core::unordered_set computeDependants() override { return {}; } @@ -102,6 +102,10 @@ class ICPUPipelineCache final : public IPreHashed private: entries_map_t m_cache; + + inline virtual void visitDependentsImpl(std::function visit) const override + { + } }; } diff --git a/include/nbl/asset/ICPUPipelineLayout.h b/include/nbl/asset/ICPUPipelineLayout.h index 4b668c1472..c7d835faae 100644 --- a/include/nbl/asset/ICPUPipelineLayout.h +++ b/include/nbl/asset/ICPUPipelineLayout.h @@ -92,6 +92,15 @@ class ICPUPipelineLayout : public IAsset, public IPipelineLayout visit) 
const override + { + for (auto i = 0; i < m_descSetLayouts.size(); i++) + { + if (m_descSetLayouts[i]) continue; + if (!visit(m_descSetLayouts[i].get())) return; + } + } + }; } diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h index 0c448b06b1..09101c73ee 100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -138,6 +138,23 @@ class ICPURayTracingPipeline final : public ICPUPipeline visit) const override + { + core::unordered_set dependants; + const auto visitOnce = [&](const IAsset* dep) -> bool { + auto [iter, inserted] = dependants.insert(dep); + if (inserted) return visit(dep); + return true; + }; + visitOnce(m_raygen.shader.get()); + for (const auto& missInfo : self->m_misses) visitOnce(missInfo.shader.get()); + for (const auto& anyHitInfo : self->m_hitGroups.anyHits) visitOnce(anyHitInfo.shader.get()); + for (const auto& closestHitInfo : self->m_hitGroups.closestHits) visitOnce(closestHitInfo.shader.get()); + for (const auto& intersectionInfo : self->m_hitGroups.intersections) visitOnce(intersectionInfo.shader.get()); + for (const auto& callableInfo : self->m_callables) visitOnce(callableInfo.shader.get()); + + } + inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final { auto newPipeline = new ICPURayTracingPipeline(layout.get()); diff --git a/include/nbl/asset/ICPURenderpass.h b/include/nbl/asset/ICPURenderpass.h index 9cc73af881..517ffbe766 100644 --- a/include/nbl/asset/ICPURenderpass.h +++ b/include/nbl/asset/ICPURenderpass.h @@ -52,6 +52,12 @@ class ICPURenderpass : public IRenderpass, public IAsset inline ICPURenderpass(const SCreationParams& _params, const SCreationParamValidationResult& _validation) : IRenderpass(_params, _validation) {} inline ~ICPURenderpass() = default; + private: + + inline virtual void visitDependentsImpl(std::function visit) const override + { + } + }; } diff --git 
a/include/nbl/asset/ICPURenderpassIndependentPipeline.h b/include/nbl/asset/ICPURenderpassIndependentPipeline.h index 83536e0c54..feb04cd1c4 100644 --- a/include/nbl/asset/ICPURenderpassIndependentPipeline.h +++ b/include/nbl/asset/ICPURenderpassIndependentPipeline.h @@ -157,6 +157,12 @@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, std::array,GRAPHICS_SHADER_STAGE_COUNT> m_entries = {}; std::array m_infos = {}; #endif + + private: + + inline virtual void visitDependentsImpl(std::function visit) const override + { + } }; } diff --git a/include/nbl/asset/ICPUSampler.h b/include/nbl/asset/ICPUSampler.h index ed11e7695d..d2ef756cad 100644 --- a/include/nbl/asset/ICPUSampler.h +++ b/include/nbl/asset/ICPUSampler.h @@ -78,6 +78,12 @@ class ICPUSampler : public ISampler, public IAsset { return {}; } + + private: + + inline virtual void visitDependentsImpl(std::function visit) const override + { + } }; } diff --git a/include/nbl/asset/ICPUSkeleton.h b/include/nbl/asset/ICPUSkeleton.h index a29adbabbc..fb5c5953e0 100644 --- a/include/nbl/asset/ICPUSkeleton.h +++ b/include/nbl/asset/ICPUSkeleton.h @@ -96,6 +96,12 @@ class ICPUSkeleton final : public ISkeleton, public IAsset using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; return core::unordered_set{ self->m_defaultTransforms.buffer.get(), self->m_parentJointIDs.buffer.get() }; } + + inline virtual void visitDependentsImpl(std::function visit) const override + { + if (!visit(m_defaultTransforms.buffer.get())) return; + if (!visit(m_parentJointIDs.buffer.get())) return; + } }; } diff --git a/include/nbl/asset/IShader.h b/include/nbl/asset/IShader.h index 59286e219d..4574ac073a 100644 --- a/include/nbl/asset/IShader.h +++ b/include/nbl/asset/IShader.h @@ -114,6 +114,11 @@ class IShader : public IAsset using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; return core::unordered_set{self->m_code.get()}; } + + inline virtual void visitDependentsImpl(std::function 
visit) const override + { + if (!visit(m_code.get())) return; + } }; } From d0a0245e7619aa1c2ae02672fc09683e0750db30 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 11 Jun 2025 20:35:28 +0700 Subject: [PATCH 298/346] Check shader availability in getRequiredSubgroupStages --- include/nbl/video/IGPUComputePipeline.h | 2 +- include/nbl/video/IGPUGraphicsPipeline.h | 2 +- include/nbl/video/IGPURayTracingPipeline.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/nbl/video/IGPUComputePipeline.h b/include/nbl/video/IGPUComputePipeline.h index 4c7bac1e6a..1b6cbd69f2 100644 --- a/include/nbl/video/IGPUComputePipeline.h +++ b/include/nbl/video/IGPUComputePipeline.h @@ -64,7 +64,7 @@ class IGPUComputePipeline : public IGPUPipeline getRequiredSubgroupStages() const { - if (shader.requiredSubgroupSize >= asset::IPipelineBase::SUBGROUP_SIZE::REQUIRE_4) + if (shader.shader && shader.requiredSubgroupSize >= asset::IPipelineBase::SUBGROUP_SIZE::REQUIRE_4) { return hlsl::ESS_COMPUTE; } diff --git a/include/nbl/video/IGPUGraphicsPipeline.h b/include/nbl/video/IGPUGraphicsPipeline.h index dd2e587ee4..7d38ea677e 100644 --- a/include/nbl/video/IGPUGraphicsPipeline.h +++ b/include/nbl/video/IGPUGraphicsPipeline.h @@ -72,7 +72,7 @@ class IGPUGraphicsPipeline : public IGPUPipeline stages; auto processSpecInfo = [&](const SShaderSpecInfo& spec, hlsl::ShaderStage stage) { - if (spec.requiredSubgroupSize >= SUBGROUP_SIZE::REQUIRE_4) { + if (spec.shader && spec.requiredSubgroupSize >= SUBGROUP_SIZE::REQUIRE_4) { stages |= stage; } }; diff --git a/include/nbl/video/IGPURayTracingPipeline.h b/include/nbl/video/IGPURayTracingPipeline.h index 90060ab883..6d77fc360e 100644 --- a/include/nbl/video/IGPURayTracingPipeline.h +++ b/include/nbl/video/IGPURayTracingPipeline.h @@ -149,7 +149,7 @@ class IGPURayTracingPipeline : public IGPUPipeline stages; auto processSpecInfo = [&](const SShaderSpecInfo& spec, hlsl::ShaderStage stage) { - if (spec.requiredSubgroupSize >= 
SUBGROUP_SIZE::REQUIRE_4) { + if (spec.shader && spec.requiredSubgroupSize >= SUBGROUP_SIZE::REQUIRE_4) { stages |= stage; } }; From 2ad3e732084570ec915191b0662783523a72c86a Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 11 Jun 2025 20:52:38 +0700 Subject: [PATCH 299/346] Use visitDependents for discardDependantContents and anyDependantDiscardedContents --- include/nbl/asset/IPreHashed.h | 50 ++++++++++++++-------------------- 1 file changed, 20 insertions(+), 30 deletions(-) diff --git a/include/nbl/asset/IPreHashed.h b/include/nbl/asset/IPreHashed.h index 94fb9a7d2d..054bfaee92 100644 --- a/include/nbl/asset/IPreHashed.h +++ b/include/nbl/asset/IPreHashed.h @@ -43,66 +43,56 @@ class IPreHashed : public IAsset { core::stack stack; core::unordered_set alreadyVisited; // whether we have push the node to the stack - core::unordered_set alreadyDescended; // whether we have push the children to the stack - auto push = [&stack,&alreadyVisited](IAsset* node) -> void + auto push = [&stack,&alreadyVisited](IAsset* node) -> bool { const auto [dummy,inserted] = alreadyVisited.insert(node); if (inserted) stack.push(node); + return true; }; for (const auto& root : roots) push(root); while (!stack.empty()) { auto* entry = stack.top(); - const auto [dummy, inserted] = alreadyDescended.insert(entry); - if (inserted) - { - core::unordered_set dependants = entry->computeDependants(); - for (auto* dependant : dependants) push(dependant); - } else - { - // post order traversal does discard - auto* isPrehashed = dynamic_cast(entry); - if (isPrehashed) - isPrehashed->discardContent(); - stack.pop(); - } + stack.pop(); + entry->visitDependents(push); + // post order traversal does discard + auto* isPrehashed = dynamic_cast(entry); + if (isPrehashed) + isPrehashed->discardContent(); } } static inline bool anyDependantDiscardedContents(const IAsset* root) { core::stack stack; core::unordered_set alreadyVisited; // whether we have push the node to the stack - core::unordered_set 
alreadyDescended; // whether we have push the children to the stack - auto push = [&stack,&alreadyVisited](const IAsset* node) -> bool + bool result = false; + auto push = [&stack,&alreadyVisited,&result](const IAsset* node) -> bool { - if (!node) - return false; const auto [dummy,inserted] = alreadyVisited.insert(node); if (inserted) { auto* isPrehashed = dynamic_cast(node); if (isPrehashed && isPrehashed->missingContent()) - return true; + { + stack = {}; + result = true; + return false; + } stack.push(node); } - return false; + return true; }; - if (push(root)) + if (!push(root)) return true; while (!stack.empty()) { auto* entry = stack.top(); - const auto [dummy, inserted] = alreadyDescended.insert(entry); - if (inserted) - { - core::unordered_set dependants = entry->computeDependants(); - for (auto* dependant : dependants) push(dependant); - } else - stack.pop(); + stack.pop(); + entry->visitDependents(push); } - return false; + return result; } protected: From 542bd0675a094eba3170188a50d05a85321e88bc Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 11 Jun 2025 20:52:52 +0700 Subject: [PATCH 300/346] Fix debloat task --- src/nbl/video/ILogicalDevice.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 52ca3a55bd..19dc001d8f 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -37,7 +37,7 @@ class SpirvDebloatTask auto findResult = m_shaderInfoMap.find(shader); assert(findResult != m_shaderInfoMap.end()); const auto& entryPoints = findResult->second.entryPoints; - auto* debloatedShader = findResult->second.debloatedShaders; + auto& debloatedShader = findResult->second.debloatedShaders; auto debloatedShaderSpec = shaderSpec; if (shader != nullptr) From c9597ff77d5aa12ebdd31d27e242163b19f713b6 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 11 Jun 2025 20:59:18 +0700 Subject: [PATCH 301/346] Initialize stages to zero. 
--- include/nbl/video/IGPUGraphicsPipeline.h | 2 +- include/nbl/video/IGPURayTracingPipeline.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/video/IGPUGraphicsPipeline.h b/include/nbl/video/IGPUGraphicsPipeline.h index 7d38ea677e..6b2201672b 100644 --- a/include/nbl/video/IGPUGraphicsPipeline.h +++ b/include/nbl/video/IGPUGraphicsPipeline.h @@ -69,7 +69,7 @@ class IGPUGraphicsPipeline : public IGPUPipeline getRequiredSubgroupStages() const { - core::bitflag stages; + core::bitflag stages = {}; auto processSpecInfo = [&](const SShaderSpecInfo& spec, hlsl::ShaderStage stage) { if (spec.shader && spec.requiredSubgroupSize >= SUBGROUP_SIZE::REQUIRE_4) { diff --git a/include/nbl/video/IGPURayTracingPipeline.h b/include/nbl/video/IGPURayTracingPipeline.h index 6d77fc360e..482861dbcc 100644 --- a/include/nbl/video/IGPURayTracingPipeline.h +++ b/include/nbl/video/IGPURayTracingPipeline.h @@ -146,7 +146,7 @@ class IGPURayTracingPipeline : public IGPUPipeline getRequiredSubgroupStages() const { - core::bitflag stages; + core::bitflag stages = {}; auto processSpecInfo = [&](const SShaderSpecInfo& spec, hlsl::ShaderStage stage) { if (spec.shader && spec.requiredSubgroupSize >= SUBGROUP_SIZE::REQUIRE_4) { From 046a334819f1b88454afb6cb7ab4de3b6d8906e9 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 11 Jun 2025 21:02:23 +0700 Subject: [PATCH 302/346] More descriptive error --- include/nbl/video/ILogicalDevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 3f4dfa0f05..d8ef2bdef1 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -1263,7 +1263,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe const auto requiredSubgroupSizeStages = getPhysicalDeviceLimits().requiredSubgroupSizeStages; if (!requiredSubgroupSizeStages.hasFlags(ci.getRequiredSubgroupStages())) { - 
NBL_LOG_ERROR("Invalid shader stage"); + NBL_LOG_ERROR("Shader stage is not a valid bit specified in requiredSubgroupSizeStages"); return {}; } From fc1bc51846626a425ec697d53bbacc6273d11159 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 13 Jun 2025 14:08:25 +0700 Subject: [PATCH 303/346] removed redundant stuff, make config more readable --- .../hlsl/workgroup2/arithmetic_config.hlsl | 48 +++++++++++-------- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 38 +++++++-------- src/nbl/builtin/CMakeLists.txt | 1 + 3 files changed, 47 insertions(+), 40 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index e11e238130..419547bfd8 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -55,16 +55,22 @@ struct ArithmeticConfiguration static_assert(VirtualWorkgroupSize<=WorkgroupSize*SubgroupSize); using items_per_invoc_t = impl::items_per_invocation; - NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_0 = items_per_invoc_t::value0; - NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_1 = items_per_invoc_t::value1; - NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = items_per_invoc_t::value2; + using ItemsPerInvocation = typename items_per_invoc_t::ItemsPerInvocation; + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_0 = tuple_element<0,ItemsPerInvocation>::type::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_1 = tuple_element<1,ItemsPerInvocation>::type::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = tuple_element<2,ItemsPerInvocation>::type::value; static_assert(ItemsPerInvocation_2<=4, "4 level scan would have been needed with this config!"); NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelInputCount_1 = conditional_value>SubgroupSizeLog2), SubgroupSize>, SubgroupSize*ItemsPerInvocation_1>::value; NBL_CONSTEXPR_STATIC_INLINE 
uint16_t LevelInputCount_2 = conditional_value::value; - NBL_CONSTEXPR_STATIC_INLINE uint16_t __SubgroupsPerVirtualWorkgroup = LevelInputCount_1 / ItemsPerInvocation_1; + NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualInvocationsAtLevel1 = LevelInputCount_1 / ItemsPerInvocation_1; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t __padding = conditional_value::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t __channelStride_1 = conditional_value::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t __channelStride_2 = conditional_value::value; + using ChannelStride = tuple,integral_constant >; // user specified the shared mem size of Scalars NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = conditional_value::value + LevelInputCount_1 >::value; - NBL_CONSTEXPR_STATIC_INLINE uint16_t __padding = conditional_value::value; static bool electLast() { @@ -94,16 +99,21 @@ struct ArithmeticConfiguration template0 && level::type::value; + const uint16_t outChannel = virtualSubgroupID & (ItemsPerNextInvocation-uint16_t(1u)); + const uint16_t outInvocation = virtualSubgroupID/ItemsPerNextInvocation; + const uint16_t localOffset = outChannel * tuple_element::type::value + outInvocation; if (level==2) - return LevelInputCount_1 + ((SubgroupSize-uint16_t(1u))*ItemsPerInvocation_1) + (virtualSubgroupID & (ItemsPerInvocation_2-uint16_t(1u))) * nextLevelInvocationCount + (virtualSubgroupID/ItemsPerInvocation_2); + { + const uint16_t baseOffset = LevelInputCount_1 + (SubgroupSize-uint16_t(1u)) * ItemsPerNextInvocation; + return baseOffset + localOffset; + } else - return (virtualSubgroupID & (ItemsPerInvocation_1-uint16_t(1u))) * (nextLevelInvocationCount+__padding) + (virtualSubgroupID/ItemsPerInvocation_1) + virtualSubgroupID/(SubgroupSize*ItemsPerInvocation_1); + { + const uint16_t paddingOffset = virtualSubgroupID/(SubgroupSize*ItemsPerInvocation_1); + return localOffset + paddingOffset; + } } template0 && level0 && level::type::value + invocationIndex; + const uint16_t paddingOffset = 
invocationIndex/SubgroupSize; if (level==2) - return LevelInputCount_1 + ((SubgroupSize-uint16_t(1u))*ItemsPerInvocation_1) + component * levelInvocationCount + invocationIndex + invocationIndex/SubgroupSize; + { + const uint16_t baseOffset = LevelInputCount_1 + (SubgroupSize-uint16_t(1u)) * ItemsPerInvocation_1; + return baseOffset + localOffset + paddingOffset; + } else - return component * (levelInvocationCount+__padding) + invocationIndex + invocationIndex/SubgroupSize; + return localOffset + paddingOffset; } }; diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 80dec1b85c..f8242f5ae1 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -84,12 +84,15 @@ struct reduce_level0 using scalar_t = typename BinOp::type_t; using vector_t = vector; // data accessor needs to be this type - template + template static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { + using config_t = subgroup2::Configuration; + using params_t = subgroup2::ArithmeticParams; + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 0 scan - subgroup2::reduction reduction0; + subgroup2::reduction reduction0; [unroll] for (uint16_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { @@ -112,11 +115,14 @@ struct scan_level0 using scalar_t = typename BinOp::type_t; using vector_t = vector; // data accessor needs to be this type - template + template static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { + using config_t = subgroup2::Configuration; + using params_t = subgroup2::ArithmeticParams; + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); - subgroup2::inclusive_scan inclusiveScan0; + subgroup2::inclusive_scan inclusiveScan0; // 
level 0 scan [unroll] for (uint16_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) @@ -147,11 +153,10 @@ struct reduce scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { using config_t = subgroup2::Configuration; - using params_lv0_t = subgroup2::ArithmeticParams; using params_lv1_t = subgroup2::ArithmeticParams; BinOp binop; - reduce_level0::template __call(dataAccessor, scratchAccessor); + reduce_level0::template __call(dataAccessor, scratchAccessor); const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan @@ -186,11 +191,10 @@ struct scan void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { using config_t = subgroup2::Configuration; - using params_lv0_t = subgroup2::ArithmeticParams; using params_lv1_t = subgroup2::ArithmeticParams; BinOp binop; - scan_level0::template __call(dataAccessor, scratchAccessor); + scan_level0::template __call(dataAccessor, scratchAccessor); const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan @@ -216,11 +220,9 @@ struct scan dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(uint16_t(glsl::gl_SubgroupID()-1u), idx); - scalar_t left; + scalar_t left = BinOp::identity; if (idx != 0 || glsl::gl_SubgroupID() != 0) scratchAccessor.template get(bankedIndex,left); - else - left = BinOp::identity; if (Exclusive) { scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(value[Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); @@ -253,12 +255,11 @@ struct reduce scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { using config_t = subgroup2::Configuration; - using params_lv0_t = 
subgroup2::ArithmeticParams; using params_lv1_t = subgroup2::ArithmeticParams; using params_lv2_t = subgroup2::ArithmeticParams; BinOp binop; - reduce_level0::template __call(dataAccessor, scratchAccessor); + reduce_level0::template __call(dataAccessor, scratchAccessor); const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan @@ -310,12 +311,11 @@ struct scan void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { using config_t = subgroup2::Configuration; - using params_lv0_t = subgroup2::ArithmeticParams; using params_lv1_t = subgroup2::ArithmeticParams; using params_lv2_t = subgroup2::ArithmeticParams; BinOp binop; - scan_level0::template __call(dataAccessor, scratchAccessor); + scan_level0::template __call(dataAccessor, scratchAccessor); const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan @@ -357,12 +357,10 @@ struct scan for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i), lv1_val[i]); - scalar_t lv2_scan; + scalar_t lv2_scan = BinOp::identity; const uint16_t bankedIndex = Config::template sharedStoreIndex<2>(uint16_t(glsl::gl_SubgroupID()-1u)); if (glsl::gl_SubgroupID() != 0) scratchAccessor.template get(bankedIndex, lv2_scan); - else - lv2_scan = BinOp::identity; [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) @@ -378,11 +376,9 @@ struct scan dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(uint16_t(glsl::gl_SubgroupID()-1u), idx); - scalar_t left; + scalar_t left = BinOp::identity; if (idx != 0 || glsl::gl_SubgroupID() != 0) scratchAccessor.template get(bankedIndex,left); - else - left = BinOp::identity; if (Exclusive) { scalar_t left_last_elem = hlsl::mix(BinOp::identity, 
glsl::subgroupShuffleUp(value[Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index a6405a3c99..d051c2153b 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -369,6 +369,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/anisotropi LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/loadable_image.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/mip_mapped.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/storable_image.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/generic_shared_data.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/fft.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/workgroup_arithmetic.hlsl") #tgmath From 10b7f508f82f180f1260eb875291f153c7f96b4b Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 13 Jun 2025 15:59:28 +0700 Subject: [PATCH 304/346] fix some bugs, readability fix --- .../builtin/hlsl/workgroup2/arithmetic_config.hlsl | 14 +++++++------- .../nbl/builtin/hlsl/workgroup2/shared_scan.hlsl | 8 ++++++-- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 419547bfd8..f894eac58a 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -68,9 +68,9 @@ struct ArithmeticConfiguration NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualInvocationsAtLevel1 = LevelInputCount_1 / ItemsPerInvocation_1; NBL_CONSTEXPR_STATIC_INLINE uint16_t __padding = conditional_value::value; - NBL_CONSTEXPR_STATIC_INLINE uint16_t __channelStride_1 = conditional_value::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t __channelStride_1 = 
conditional_value::value + __padding; NBL_CONSTEXPR_STATIC_INLINE uint16_t __channelStride_2 = conditional_value::value; - using ChannelStride = tuple,integral_constant >; + using ChannelStride = tuple,integral_constant,integral_constant >; // we don't use stride 0 // user specified the shared mem size of Scalars NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = conditional_value::type::value; const uint16_t outChannel = virtualSubgroupID & (ItemsPerNextInvocation-uint16_t(1u)); - const uint16_t outInvocation = virtualSubgroupID/ItemsPerNextInvocation; + const uint16_t outInvocation = virtualSubgroupID / ItemsPerNextInvocation; const uint16_t localOffset = outChannel * tuple_element::type::value + outInvocation; if (level==2) { - const uint16_t baseOffset = LevelInputCount_1 + (SubgroupSize-uint16_t(1u)) * ItemsPerNextInvocation; + const uint16_t baseOffset = LevelInputCount_1 + (SubgroupSize - uint16_t(1u)) * ItemsPerInvocation_1; return baseOffset + localOffset; } else { - const uint16_t paddingOffset = virtualSubgroupID/(SubgroupSize*ItemsPerInvocation_1); + const uint16_t paddingOffset = virtualSubgroupID / (SubgroupSize * ItemsPerInvocation_1); return localOffset + paddingOffset; } } @@ -128,11 +128,11 @@ struct ArithmeticConfiguration static uint16_t sharedLoadIndex(const uint16_t invocationIndex, const uint16_t component) { const uint16_t localOffset = component * tuple_element::type::value + invocationIndex; - const uint16_t paddingOffset = invocationIndex/SubgroupSize; + const uint16_t paddingOffset = invocationIndex / SubgroupSize; if (level==2) { - const uint16_t baseOffset = LevelInputCount_1 + (SubgroupSize-uint16_t(1u)) * ItemsPerInvocation_1; + const uint16_t baseOffset = LevelInputCount_1 + (SubgroupSize - uint16_t(1u)) * ItemsPerInvocation_1; return baseOffset + localOffset + paddingOffset; } else diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 
f8242f5ae1..5b19c55fbd 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -337,11 +337,15 @@ struct scan subgroup2::inclusive_scan inclusiveScan2; if (glsl::gl_SubgroupID() == 0) { - const uint16_t one = uint16_t(1u); + const uint16_t lastChannel = Config::ItemsPerInvocation_1 - uint16_t(1u); vector_lv2_t lv2_val; [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(Config::template sharedLoadIndex<1>((invocationIndex*Config::ItemsPerInvocation_2+i+one)*Config::SubgroupSize-one, Config::ItemsPerInvocation_1-one),lv2_val[i]); + { + const uint16_t inputSubgroupID = invocationIndex * Config::ItemsPerInvocation_2 + i; + const uint16_t inputSubgroupLastInvocation = inputSubgroupID * Config::SubgroupSize + (Config::SubgroupSize - uint16_t(1u)); + scratchAccessor.template get(Config::template sharedLoadIndex<1>(inputSubgroupLastInvocation, lastChannel),lv2_val[i]); + } lv2_val = inclusiveScan2(lv2_val); [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) From 50281c67af6b113924eafda8b0cdb97d6af1c836 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 14 Jun 2025 15:09:02 +0700 Subject: [PATCH 305/346] Remove computeDependants interface --- include/nbl/asset/IAsset.h | 4 -- include/nbl/asset/ICPUAccelerationStructure.h | 31 ------------- include/nbl/asset/ICPUAnimationLibrary.h | 17 -------- include/nbl/asset/ICPUBuffer.h | 10 ----- include/nbl/asset/ICPUBufferView.h | 17 -------- include/nbl/asset/ICPUComputePipeline.h | 18 -------- include/nbl/asset/ICPUDescriptorSet.h | 43 ------------------- include/nbl/asset/ICPUDescriptorSetLayout.h | 22 ---------- include/nbl/asset/ICPUGraphicsPipeline.h | 20 --------- include/nbl/asset/ICPUImage.h | 11 ----- include/nbl/asset/ICPUImageView.h | 16 ------- include/nbl/asset/ICPUMesh.h | 11 ----- include/nbl/asset/ICPUMeshBuffer.h | 11 ----- include/nbl/asset/ICPUPipelineCache.h | 10 ----- 
include/nbl/asset/ICPUPipelineLayout.h | 23 ---------- include/nbl/asset/ICPURayTracingPipeline.h | 22 ---------- include/nbl/asset/ICPURenderpass.h | 10 ----- .../asset/ICPURenderpassIndependentPipeline.h | 10 ----- include/nbl/asset/ICPUSampler.h | 10 ----- include/nbl/asset/ICPUSkeleton.h | 16 ------- include/nbl/asset/IShader.h | 16 ------- src/nbl/asset/ICPUDescriptorSet.cpp | 10 ----- 22 files changed, 358 deletions(-) diff --git a/include/nbl/asset/IAsset.h b/include/nbl/asset/IAsset.h index cc105f2633..78f96cbbdd 100644 --- a/include/nbl/asset/IAsset.h +++ b/include/nbl/asset/IAsset.h @@ -175,10 +175,6 @@ class IAsset : virtual public core::IReferenceCounted }); } - virtual core::unordered_set computeDependants() const = 0; - - virtual core::unordered_set computeDependants() = 0; - virtual bool valid() const { //TODO(kevinyu): Temporary set this to true to make changes compile. Will revisit this later for each asset diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index 8d02b3ac8b..61a550cd81 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -136,17 +136,6 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo } - // Do not report anything as a dependant, we'll simply drop the data instead of discarding its contents - inline core::unordered_set computeDependants() const override - { - return {}; - } - - inline core::unordered_set computeDependants() override - { - return {}; - } - inline core::blake3_hash_t computeContentHash() const override { if (missingContent()) @@ -272,16 +261,6 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA // ICPUTopLevelAccelerationStructure() = default; - inline core::unordered_set computeDependants() const override - { - return computeDependantsImpl(this); - } - - inline core::unordered_set computeDependants() override - { - return 
computeDependantsImpl(this); - } - // inline auto& getBuildRangeInfo() { @@ -381,16 +360,6 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA hlsl::acceleration_structures::top_level::BuildRangeInfo m_buildRangeInfo; core::bitflag m_buildFlags = BUILD_FLAGS::PREFER_FAST_BUILD_BIT; - template - requires(std::same_as, ICPUTopLevelAccelerationStructure>) - static auto computeDependantsImpl(Self* self) { - using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; - core::unordered_set dependants; - for (const auto& instance : *self->m_instances) - dependants.insert(instance.getBase().blas.get()); - return dependants; - } - inline virtual void visitDependentsImpl(std::function visit) const override { for (const auto& instance : *m_instances) diff --git a/include/nbl/asset/ICPUAnimationLibrary.h b/include/nbl/asset/ICPUAnimationLibrary.h index 33b5b182c9..490c6b6e2e 100644 --- a/include/nbl/asset/ICPUAnimationLibrary.h +++ b/include/nbl/asset/ICPUAnimationLibrary.h @@ -96,25 +96,8 @@ class ICPUAnimationLibrary final : public IAnimationLibrary, public constexpr static inline auto AssetType = ET_ANIMATION_LIBRARY; inline E_TYPE getAssetType() const override { return AssetType; } - inline core::unordered_set computeDependants() const override - { - return computeDependantsImpl(this); - } - - inline core::unordered_set computeDependants() override - { - return computeDependantsImpl(this); - } - private: - template - requires(std::same_as, ICPUAnimationLibrary>) - static auto computeDependantsImpl(Self* self) { - using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; - return core::unordered_set{ self->m_keyframeStorageBinding.buffer.get(), self->m_timestampStorageBinding.buffer.get(), self->m_animationStorageRange.buffer.get() }; - } - virtual void visitDependentsImpl(std::function visit) const override { if (!visit(m_keyframeStorageBinding.buffer.get())) return; diff --git a/include/nbl/asset/ICPUBuffer.h 
b/include/nbl/asset/ICPUBuffer.h index 94f1dc750a..9b33442a84 100644 --- a/include/nbl/asset/ICPUBuffer.h +++ b/include/nbl/asset/ICPUBuffer.h @@ -75,16 +75,6 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed constexpr static inline auto AssetType = ET_BUFFER; inline IAsset::E_TYPE getAssetType() const override final { return AssetType; } - inline core::unordered_set computeDependants() const override - { - return {}; - } - - inline core::unordered_set computeDependants() override - { - return {}; - } - inline core::blake3_hash_t computeContentHash() const override { core::blake3_hasher hasher; diff --git a/include/nbl/asset/ICPUBufferView.h b/include/nbl/asset/ICPUBufferView.h index 1741a1f445..ebe28832f6 100644 --- a/include/nbl/asset/ICPUBufferView.h +++ b/include/nbl/asset/ICPUBufferView.h @@ -28,17 +28,6 @@ class ICPUBufferView : public IBufferView, public IAsset constexpr static inline auto AssetType = ET_BUFFER_VIEW; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } - - inline core::unordered_set computeDependants() const override - { - return computeDependantsImpl(this); - } - - inline core::unordered_set computeDependants() override - { - return computeDependantsImpl(this); - } - ICPUBuffer* getUnderlyingBuffer() { assert(isMutable()); @@ -61,12 +50,6 @@ class ICPUBufferView : public IBufferView, public IAsset virtual ~ICPUBufferView() = default; private: - template - requires(std::same_as, ICPUBufferView>) - static auto computeDependantsImpl(Self* self) { - using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; - return core::unordered_set{ self->m_buffer.get() }; - } inline virtual void visitDependentsImpl(std::function visit) const override { diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 5dbec00ea4..61da031b20 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -28,17 +28,6 @@ class 
ICPUComputePipeline final : public ICPUPipeline computeDependants() const override - { - return computeDependantsImpl(this); - } - - inline core::unordered_set computeDependants() override - { - return computeDependantsImpl(this); - } inline std::span getSpecInfos(hlsl::ShaderStage stage) const override { @@ -98,13 +87,6 @@ class ICPUComputePipeline final : public ICPUPipeline - requires(std::same_as, ICPUComputePipeline>) - static auto computeDependantsImpl(Self* self) { - using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; - return core::unordered_set{ self->m_layout.get(), self->m_specInfo.shader.get() }; - } virtual void visitDependentsImpl(std::function visit) const override { diff --git a/include/nbl/asset/ICPUDescriptorSet.h b/include/nbl/asset/ICPUDescriptorSet.h index 05a7f51f60..ee99b3c9e8 100644 --- a/include/nbl/asset/ICPUDescriptorSet.h +++ b/include/nbl/asset/ICPUDescriptorSet.h @@ -77,9 +77,6 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSet clone(uint32_t _depth = ~0u) const override; - core::unordered_set computeDependants() const override; - core::unordered_set computeDependants() override; - protected: virtual ~ICPUDescriptorSet() = default; @@ -88,46 +85,6 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSet m_descriptorInfos[static_cast(IDescriptor::E_TYPE::ET_COUNT)]; - template - requires(std::same_as, ICPUDescriptorSet>) - static auto computeDependantsImpl(Self* self) { - using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; - - using cpu_buffer_ptr_t = std::conditional_t, const ICPUBuffer*, ICPUBuffer*>; - using cpu_sampler_ptr_t = std::conditional_t, const ICPUSampler*, ICPUSampler*>; - using cpu_image_view_ptr_t = std::conditional_t, const ICPUImageView*, ICPUImageView*>; - using cpu_buffer_view_ptr_t = std::conditional_t, const ICPUBufferView*, ICPUBufferView*>; - using cpu_tlas_ptr_t = std::conditional_t, const ICPUTopLevelAccelerationStructure*, ICPUTopLevelAccelerationStructure*>; - 
- core::unordered_set dependants = { self->m_layout.get() }; - for (auto i = 0u; i < static_cast(IDescriptor::E_TYPE::ET_COUNT); i++) - { - if (!self->m_descriptorInfos[i]) continue; - const auto size = self->m_descriptorInfos[i]->size(); - for (auto desc_i = 0u; desc_i < size; desc_i++) - { - auto* desc = self->m_descriptorInfos[i]->operator[](desc_i).desc.get(); - if (!desc) continue; - switch (IDescriptor::GetTypeCategory(static_cast(i))) - { - case IDescriptor::EC_BUFFER: - dependants.insert(static_cast(desc)); - case IDescriptor::EC_SAMPLER: - dependants.insert(static_cast(desc)); - case IDescriptor::EC_IMAGE: - dependants.insert(static_cast(desc)); - case IDescriptor::EC_BUFFER_VIEW: - dependants.insert(static_cast(desc)); - case IDescriptor::EC_ACCELERATION_STRUCTURE: - dependants.insert(static_cast(desc)); - default: - break; - } - } - } - return dependants; - } - virtual void visitDependentsImpl(std::function visit) const override { for (auto i = 0u; i < static_cast(IDescriptor::E_TYPE::ET_COUNT); i++) diff --git a/include/nbl/asset/ICPUDescriptorSetLayout.h b/include/nbl/asset/ICPUDescriptorSetLayout.h index 8dce4d9db4..871a58395b 100644 --- a/include/nbl/asset/ICPUDescriptorSetLayout.h +++ b/include/nbl/asset/ICPUDescriptorSetLayout.h @@ -57,33 +57,11 @@ class ICPUDescriptorSetLayout : public IDescriptorSetLayout, public constexpr static inline auto AssetType = ET_DESCRIPTOR_SET_LAYOUT; inline E_TYPE getAssetType() const override { return AssetType; } - core::unordered_set computeDependants() const override - { - return computeDependantsImpl(this); - } - - core::unordered_set computeDependants() override - { - return computeDependantsImpl(this); - } - protected: virtual ~ICPUDescriptorSetLayout() = default; private: - template - requires(std::same_as, ICPUDescriptorSetLayout>) - static auto computeDependantsImpl(Self* self) { - using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; - core::unordered_set dependants; - if 
(!self->m_immutableSamplers) return dependants; - for (const auto& sampler: *self->m_immutableSamplers) - { - dependants.insert(sampler.get()); - } - return dependants; - } inline virtual void visitDependentsImpl(std::function visit) const override { diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index 470c5d813b..14a745f65f 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -29,16 +29,6 @@ class ICPUGraphicsPipeline final : public ICPUPipeline computeDependants() const override - { - return computeDependantsImpl(this); - } - - inline core::unordered_set computeDependants() override - { - return computeDependantsImpl(this); - } - inline const SCachedCreationParams& getCachedCreationParams() const { return pipeline_base_t::getCachedCreationParams(); @@ -124,16 +114,6 @@ class ICPUGraphicsPipeline final : public ICPUPipeline(hlsl::ShaderStage::ESS_VERTEX + index); } - template - requires(std::same_as, ICPUGraphicsPipeline>) - static auto computeDependantsImpl(Self* self) { - using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; - core::unordered_set dependants = { self->m_layout.get(), self->m_renderpass.get()}; - for (const auto& info : self->m_specInfos) - if (info.shader) dependants.insert(info.shader.get()); - return dependants; - } - inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final { auto* newPipeline = new ICPUGraphicsPipeline(layout.get(), m_renderpass.get()); diff --git a/include/nbl/asset/ICPUImage.h b/include/nbl/asset/ICPUImage.h index f13d75b76a..e3a0d8558f 100644 --- a/include/nbl/asset/ICPUImage.h +++ b/include/nbl/asset/ICPUImage.h @@ -45,17 +45,6 @@ class NBL_API2 ICPUImage final : public IImage, public IPreHashed constexpr static inline auto AssetType = ET_IMAGE; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } - // Do not report buffer as dependant, as 
we will simply drop it instead of discarding its contents! - inline core::unordered_set computeDependants() const override - { - return {}; - } - - inline core::unordered_set computeDependants() override - { - return {}; - } - core::blake3_hash_t computeContentHash() const override; // Having regions specififed to upload is optional! So to have content missing we must have regions but no buffer content diff --git a/include/nbl/asset/ICPUImageView.h b/include/nbl/asset/ICPUImageView.h index 74cb143fe6..f30489bdfd 100644 --- a/include/nbl/asset/ICPUImageView.h +++ b/include/nbl/asset/ICPUImageView.h @@ -49,16 +49,6 @@ class ICPUImageView final : public IImageView, public IAsset constexpr static inline auto AssetType = ET_IMAGE_VIEW; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } - inline core::unordered_set computeDependants() const override - { - return computeDependantsImpl(this); - } - - inline core::unordered_set computeDependants() override - { - return computeDependantsImpl(this); - } - //! const SComponentMapping& getComponents() const { return params.components; } SComponentMapping& getComponents() @@ -76,12 +66,6 @@ class ICPUImageView final : public IImageView, public IAsset virtual ~ICPUImageView() = default; private: - template - requires(std::same_as, ICPUImageView>) - static auto computeDependantsImpl(Self* self) { - using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; - return core::unordered_set{ self->params.image.get() }; - } inline virtual void visitDependentsImpl(std::function visit) const override { diff --git a/include/nbl/asset/ICPUMesh.h b/include/nbl/asset/ICPUMesh.h index f52db5055e..2a65dc4e17 100644 --- a/include/nbl/asset/ICPUMesh.h +++ b/include/nbl/asset/ICPUMesh.h @@ -81,17 +81,6 @@ class ICPUMesh final : public IMesh, public IAsset return cp; } - //! 
CLASS IS DEPRECATED ANYWAY - inline core::unordered_set computeDependants() const override - { - return {}; - } - - inline core::unordered_set computeDependants() override - { - return {}; - } - protected: private: diff --git a/include/nbl/asset/ICPUMeshBuffer.h b/include/nbl/asset/ICPUMeshBuffer.h index 9872cc6b10..6f4b7f074c 100644 --- a/include/nbl/asset/ICPUMeshBuffer.h +++ b/include/nbl/asset/ICPUMeshBuffer.h @@ -611,17 +611,6 @@ class ICPUMeshBuffer final : public IMeshBuffer(const_cast(this)->getJointAABBs()); } - //! Class is deprecated anyway. - inline core::unordered_set computeDependants() const override - { - return {}; - } - - inline core::unordered_set computeDependants() override - { - return {}; - } - inline virtual void visitDependentsImpl(std::function visit) const override { } diff --git a/include/nbl/asset/ICPUPipelineCache.h b/include/nbl/asset/ICPUPipelineCache.h index 85ac650a22..217499170a 100644 --- a/include/nbl/asset/ICPUPipelineCache.h +++ b/include/nbl/asset/ICPUPipelineCache.h @@ -60,16 +60,6 @@ class ICPUPipelineCache final : public IPreHashed return core::make_smart_refctd_ptr(std::move(cache_cp)); } - inline core::unordered_set computeDependants() const override - { - return {}; - } - - inline core::unordered_set computeDependants() override - { - return {}; - } - // inline core::blake3_hash_t computeContentHash() const override { diff --git a/include/nbl/asset/ICPUPipelineLayout.h b/include/nbl/asset/ICPUPipelineLayout.h index c7d835faae..cfab4e7360 100644 --- a/include/nbl/asset/ICPUPipelineLayout.h +++ b/include/nbl/asset/ICPUPipelineLayout.h @@ -30,16 +30,6 @@ class ICPUPipelineLayout : public IAsset, public IPipelineLayout&& _layout2, core::smart_refctd_ptr&& _layout3 ) : IPipelineLayout(_pcRanges,std::move(_layout0),std::move(_layout1),std::move(_layout2),std::move(_layout3)) {} - inline core::unordered_set computeDependants() const override - { - return computeDependantsImpl(this); - } - - inline core::unordered_set 
computeDependants() override - { - return computeDependantsImpl(this); - } - // ICPUDescriptorSetLayout* getDescriptorSetLayout(uint32_t _set) { @@ -79,19 +69,6 @@ class ICPUPipelineLayout : public IAsset, public IPipelineLayout - requires(std::same_as, ICPUPipelineLayout>) - static auto computeDependantsImpl(Self* self) { - using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; - core::unordered_set dependants; - for (auto i = 0; i < self->m_descSetLayouts.size(); i++) - { - if (self->m_descSetLayouts[i]) continue; - dependants.insert(self->m_descSetLayouts[i].get()); - } - return dependants; - } - inline virtual void visitDependentsImpl(std::function visit) const override { for (auto i = 0; i < m_descSetLayouts.size(); i++) diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h index 09101c73ee..5e8e55b5e9 100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -36,14 +36,6 @@ class ICPURayTracingPipeline final : public ICPUPipeline computeDependants() const override final { - return computeDependantsImpl(this); - } - - virtual core::unordered_set computeDependants() override final { - return computeDependantsImpl(this); - } - inline virtual std::span getSpecInfos(hlsl::ShaderStage stage) const override final { switch (stage) @@ -124,20 +116,6 @@ class ICPURayTracingPipeline final : public ICPUPipeline - requires(std::same_as, ICPURayTracingPipeline>) - static auto computeDependantsImpl(Self* self) { - using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; - core::unordered_set dependants; - dependants.insert(self->m_raygen.shader.get()); - for (const auto& missInfo : self->m_misses) dependants.insert(missInfo.shader.get()); - for (const auto& anyHitInfo : self->m_hitGroups.anyHits) dependants.insert(anyHitInfo.shader.get()); - for (const auto& closestHitInfo : self->m_hitGroups.closestHits) dependants.insert(closestHitInfo.shader.get()); - for 
(const auto& intersectionInfo : self->m_hitGroups.intersections) dependants.insert(intersectionInfo.shader.get()); - for (const auto& callableInfo : self->m_callables) dependants.insert(callableInfo.shader.get()); - return dependants; - } - inline virtual void visitDependentsImpl(std::function visit) const override { core::unordered_set dependants; diff --git a/include/nbl/asset/ICPURenderpass.h b/include/nbl/asset/ICPURenderpass.h index 517ffbe766..39fe388427 100644 --- a/include/nbl/asset/ICPURenderpass.h +++ b/include/nbl/asset/ICPURenderpass.h @@ -38,16 +38,6 @@ class ICPURenderpass : public IRenderpass, public IAsset return ET_RENDERPASS; } - inline core::unordered_set computeDependants() const override - { - return {}; - } - - inline core::unordered_set computeDependants() override - { - return {}; - } - protected: inline ICPURenderpass(const SCreationParams& _params, const SCreationParamValidationResult& _validation) : IRenderpass(_params, _validation) {} inline ~ICPURenderpass() = default; diff --git a/include/nbl/asset/ICPURenderpassIndependentPipeline.h b/include/nbl/asset/ICPURenderpassIndependentPipeline.h index feb04cd1c4..6db56fa279 100644 --- a/include/nbl/asset/ICPURenderpassIndependentPipeline.h +++ b/include/nbl/asset/ICPURenderpassIndependentPipeline.h @@ -72,16 +72,6 @@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, _NBL_STATIC_INLINE_CONSTEXPR auto AssetType = ET_RENDERPASS_INDEPENDENT_PIPELINE; inline E_TYPE getAssetType() const override { return AssetType; } - inline core::unordered_set computeDependants() const override - { - return {}; - } - - inline core::unordered_set computeDependants() override - { - return {}; - } - // inline const SCachedCreationParams& getCachedCreationParams() const {return IRenderpassIndependentPipeline::getCachedCreationParams();} inline SCachedCreationParams& getCachedCreationParams() diff --git a/include/nbl/asset/ICPUSampler.h b/include/nbl/asset/ICPUSampler.h index 
d2ef756cad..4df7eb9ab5 100644 --- a/include/nbl/asset/ICPUSampler.h +++ b/include/nbl/asset/ICPUSampler.h @@ -69,16 +69,6 @@ class ICPUSampler : public ISampler, public IAsset constexpr static inline auto AssetType = ET_SAMPLER; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } - inline core::unordered_set computeDependants() const override - { - return {}; - } - - inline core::unordered_set computeDependants() override - { - return {}; - } - private: inline virtual void visitDependentsImpl(std::function visit) const override diff --git a/include/nbl/asset/ICPUSkeleton.h b/include/nbl/asset/ICPUSkeleton.h index fb5c5953e0..0d6f0d405b 100644 --- a/include/nbl/asset/ICPUSkeleton.h +++ b/include/nbl/asset/ICPUSkeleton.h @@ -79,23 +79,7 @@ class ICPUSkeleton final : public ISkeleton, public IAsset constexpr static inline auto AssetType = ET_SKELETON; inline E_TYPE getAssetType() const override { return AssetType; } - inline core::unordered_set computeDependants() const override - { - return computeDependantsImpl(this); - } - - inline core::unordered_set computeDependants() override - { - return computeDependantsImpl(this); - } - private: - template - requires(std::same_as, ICPUSkeleton>) - static auto computeDependantsImpl(Self* self) { - using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; - return core::unordered_set{ self->m_defaultTransforms.buffer.get(), self->m_parentJointIDs.buffer.get() }; - } inline virtual void visitDependentsImpl(std::function visit) const override { diff --git a/include/nbl/asset/IShader.h b/include/nbl/asset/IShader.h index 4574ac073a..8ce332cb99 100644 --- a/include/nbl/asset/IShader.h +++ b/include/nbl/asset/IShader.h @@ -50,16 +50,6 @@ class IShader : public IAsset constexpr static inline auto AssetType = ET_SHADER; inline E_TYPE getAssetType() const override { return AssetType; } - inline core::unordered_set computeDependants() const override - { - return computeDependantsImpl(this); - } - - inline 
core::unordered_set computeDependants() override - { - return computeDependantsImpl(this); - } - // inline core::smart_refctd_ptr clone(uint32_t _depth=~0u) const override { @@ -108,12 +98,6 @@ class IShader : public IAsset E_CONTENT_TYPE m_contentType; private: - template - requires(std::same_as, IShader>) - static auto computeDependantsImpl(Self* self) { - using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; - return core::unordered_set{self->m_code.get()}; - } inline virtual void visitDependentsImpl(std::function visit) const override { diff --git a/src/nbl/asset/ICPUDescriptorSet.cpp b/src/nbl/asset/ICPUDescriptorSet.cpp index 730f0847f2..7137edcba5 100644 --- a/src/nbl/asset/ICPUDescriptorSet.cpp +++ b/src/nbl/asset/ICPUDescriptorSet.cpp @@ -108,14 +108,4 @@ core::smart_refctd_ptr ICPUDescriptorSet::clone(uint32_t _depth) const return cp; } -core::unordered_set ICPUDescriptorSet::computeDependants() const -{ - return computeDependantsImpl(this); -} - -core::unordered_set ICPUDescriptorSet::computeDependants() -{ - return computeDependantsImpl(this); -} - } \ No newline at end of file From 6a84bd7cf9ff9301ed43250681df131a69d15ca2 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 14 Jun 2025 16:32:43 +0700 Subject: [PATCH 306/346] Rename visitDependentsImpl to visitDependents_impl --- include/nbl/asset/IAsset.h | 4 ++-- include/nbl/asset/ICPUAccelerationStructure.h | 4 ++-- include/nbl/asset/ICPUAnimationLibrary.h | 2 +- include/nbl/asset/ICPUBuffer.h | 2 +- include/nbl/asset/ICPUBufferView.h | 2 +- include/nbl/asset/ICPUComputePipeline.h | 2 +- include/nbl/asset/ICPUDescriptorSet.h | 2 +- include/nbl/asset/ICPUDescriptorSetLayout.h | 2 +- include/nbl/asset/ICPUGraphicsPipeline.h | 2 +- include/nbl/asset/ICPUImage.h | 2 +- include/nbl/asset/ICPUImageView.h | 2 +- include/nbl/asset/ICPUMesh.h | 2 +- include/nbl/asset/ICPUMeshBuffer.h | 2 +- include/nbl/asset/ICPUPipelineCache.h | 2 +- include/nbl/asset/ICPUPipelineLayout.h | 2 +- 
include/nbl/asset/ICPURayTracingPipeline.h | 21 +++++++------------ include/nbl/asset/ICPURenderpass.h | 2 +- .../asset/ICPURenderpassIndependentPipeline.h | 2 +- include/nbl/asset/ICPUSampler.h | 2 +- include/nbl/asset/ICPUSkeleton.h | 2 +- include/nbl/asset/IShader.h | 2 +- 21 files changed, 29 insertions(+), 36 deletions(-) diff --git a/include/nbl/asset/IAsset.h b/include/nbl/asset/IAsset.h index 78f96cbbdd..2e45f62bbb 100644 --- a/include/nbl/asset/IAsset.h +++ b/include/nbl/asset/IAsset.h @@ -158,7 +158,7 @@ class IAsset : virtual public core::IReferenceCounted inline void visitDependents(std::function visit) const { - visitDependentsImpl([&visit](const IAsset* dep)->bool + visitDependents_impl([&visit](const IAsset* dep)->bool { if (dep) return visit(dep); @@ -190,7 +190,7 @@ class IAsset : virtual public core::IReferenceCounted friend IAssetManager; bool m_mutable = true; - virtual void visitDependentsImpl(std::function visit) const = 0; + virtual void visitDependents_impl(std::function visit) const = 0; }; template diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index 61a550cd81..3836690bda 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -248,7 +248,7 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo core::smart_refctd_dynamic_array m_geometryPrimitiveCount = nullptr; core::bitflag m_buildFlags = BUILD_FLAGS::PREFER_FAST_TRACE_BIT; - inline virtual void visitDependentsImpl(std::function visit) const override {} + inline virtual void visitDependents_impl(std::function visit) const override {} }; class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelAccelerationStructure @@ -360,7 +360,7 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA hlsl::acceleration_structures::top_level::BuildRangeInfo m_buildRangeInfo; core::bitflag m_buildFlags = 
BUILD_FLAGS::PREFER_FAST_BUILD_BIT; - inline virtual void visitDependentsImpl(std::function visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { for (const auto& instance : *m_instances) if (!visit(instance.getBase().blas.get())) return; diff --git a/include/nbl/asset/ICPUAnimationLibrary.h b/include/nbl/asset/ICPUAnimationLibrary.h index 490c6b6e2e..2d620f562c 100644 --- a/include/nbl/asset/ICPUAnimationLibrary.h +++ b/include/nbl/asset/ICPUAnimationLibrary.h @@ -98,7 +98,7 @@ class ICPUAnimationLibrary final : public IAnimationLibrary, public private: - virtual void visitDependentsImpl(std::function visit) const override + virtual void visitDependents_impl(std::function visit) const override { if (!visit(m_keyframeStorageBinding.buffer.get())) return; if (!visit(m_timestampStorageBinding.buffer.get())) return; diff --git a/include/nbl/asset/ICPUBuffer.h b/include/nbl/asset/ICPUBuffer.h index 9b33442a84..30232c061a 100644 --- a/include/nbl/asset/ICPUBuffer.h +++ b/include/nbl/asset/ICPUBuffer.h @@ -129,7 +129,7 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed discardContent_impl(); } - inline virtual void visitDependentsImpl(std::function visit) const override {} + inline virtual void visitDependents_impl(std::function visit) const override {} void* m_data; core::smart_refctd_ptr m_mem_resource; diff --git a/include/nbl/asset/ICPUBufferView.h b/include/nbl/asset/ICPUBufferView.h index ebe28832f6..512103a9cd 100644 --- a/include/nbl/asset/ICPUBufferView.h +++ b/include/nbl/asset/ICPUBufferView.h @@ -51,7 +51,7 @@ class ICPUBufferView : public IBufferView, public IAsset private: - inline virtual void visitDependentsImpl(std::function visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { if (!visit(m_buffer.get())) return; } diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 61da031b20..02b56d02ce 
100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -88,7 +88,7 @@ class ICPUComputePipeline final : public ICPUPipeline visit) const override + virtual void visitDependents_impl(std::function visit) const override { if (!visit(m_layout.get())) return; if (!visit(m_specInfo.shader.get())) return; diff --git a/include/nbl/asset/ICPUDescriptorSet.h b/include/nbl/asset/ICPUDescriptorSet.h index ee99b3c9e8..b0d9786868 100644 --- a/include/nbl/asset/ICPUDescriptorSet.h +++ b/include/nbl/asset/ICPUDescriptorSet.h @@ -85,7 +85,7 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSet m_descriptorInfos[static_cast(IDescriptor::E_TYPE::ET_COUNT)]; - virtual void visitDependentsImpl(std::function visit) const override + virtual void visitDependents_impl(std::function visit) const override { for (auto i = 0u; i < static_cast(IDescriptor::E_TYPE::ET_COUNT); i++) { diff --git a/include/nbl/asset/ICPUDescriptorSetLayout.h b/include/nbl/asset/ICPUDescriptorSetLayout.h index 871a58395b..216297a562 100644 --- a/include/nbl/asset/ICPUDescriptorSetLayout.h +++ b/include/nbl/asset/ICPUDescriptorSetLayout.h @@ -63,7 +63,7 @@ class ICPUDescriptorSetLayout : public IDescriptorSetLayout, public private: - inline virtual void visitDependentsImpl(std::function visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { if (m_immutableSamplers) return; for (const auto& sampler : *m_immutableSamplers) diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index 14a745f65f..f39f38f673 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -127,7 +127,7 @@ class ICPUGraphicsPipeline final : public ICPUPipeline(newPipeline, core::dont_grab); } - inline virtual void visitDependentsImpl(std::function visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { if 
(!visit(m_layout.get())) return; if (!visit(m_renderpass.get())) return; diff --git a/include/nbl/asset/ICPUImage.h b/include/nbl/asset/ICPUImage.h index e3a0d8558f..847b796da0 100644 --- a/include/nbl/asset/ICPUImage.h +++ b/include/nbl/asset/ICPUImage.h @@ -217,7 +217,7 @@ class NBL_API2 ICPUImage final : public IImage, public IPreHashed } }; - inline virtual void visitDependentsImpl(std::function visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { } }; diff --git a/include/nbl/asset/ICPUImageView.h b/include/nbl/asset/ICPUImageView.h index f30489bdfd..6338021aed 100644 --- a/include/nbl/asset/ICPUImageView.h +++ b/include/nbl/asset/ICPUImageView.h @@ -67,7 +67,7 @@ class ICPUImageView final : public IImageView, public IAsset private: - inline virtual void visitDependentsImpl(std::function visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { if (!visit(params.image.get())) return; } diff --git a/include/nbl/asset/ICPUMesh.h b/include/nbl/asset/ICPUMesh.h index 2a65dc4e17..019578775c 100644 --- a/include/nbl/asset/ICPUMesh.h +++ b/include/nbl/asset/ICPUMesh.h @@ -86,7 +86,7 @@ class ICPUMesh final : public IMesh, public IAsset private: core::vector> m_meshBuffers; - inline virtual void visitDependentsImpl(std::function visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { } }; diff --git a/include/nbl/asset/ICPUMeshBuffer.h b/include/nbl/asset/ICPUMeshBuffer.h index 6f4b7f074c..8fc5ae26e9 100644 --- a/include/nbl/asset/ICPUMeshBuffer.h +++ b/include/nbl/asset/ICPUMeshBuffer.h @@ -611,7 +611,7 @@ class ICPUMeshBuffer final : public IMeshBuffer(const_cast(this)->getJointAABBs()); } - inline virtual void visitDependentsImpl(std::function visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { } }; diff --git a/include/nbl/asset/ICPUPipelineCache.h 
b/include/nbl/asset/ICPUPipelineCache.h index 217499170a..a0d4373c6e 100644 --- a/include/nbl/asset/ICPUPipelineCache.h +++ b/include/nbl/asset/ICPUPipelineCache.h @@ -93,7 +93,7 @@ class ICPUPipelineCache final : public IPreHashed private: entries_map_t m_cache; - inline virtual void visitDependentsImpl(std::function visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { } }; diff --git a/include/nbl/asset/ICPUPipelineLayout.h b/include/nbl/asset/ICPUPipelineLayout.h index cfab4e7360..f4f636601c 100644 --- a/include/nbl/asset/ICPUPipelineLayout.h +++ b/include/nbl/asset/ICPUPipelineLayout.h @@ -69,7 +69,7 @@ class ICPUPipelineLayout : public IAsset, public IPipelineLayout visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { for (auto i = 0; i < m_descSetLayouts.size(); i++) { diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h index 5e8e55b5e9..955275f819 100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -116,21 +116,14 @@ class ICPURayTracingPipeline final : public ICPUPipeline visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { - core::unordered_set dependants; - const auto visitOnce = [&](const IAsset* dep) -> bool { - auto [iter, inserted] = dependants.insert(dep); - if (inserted) return visit(dep); - return true; - }; - visitOnce(m_raygen.shader.get()); - for (const auto& missInfo : self->m_misses) visitOnce(missInfo.shader.get()); - for (const auto& anyHitInfo : self->m_hitGroups.anyHits) visitOnce(anyHitInfo.shader.get()); - for (const auto& closestHitInfo : self->m_hitGroups.closestHits) visitOnce(closestHitInfo.shader.get()); - for (const auto& intersectionInfo : self->m_hitGroups.intersections) visitOnce(intersectionInfo.shader.get()); - for (const auto& callableInfo : self->m_callables) 
visitOnce(callableInfo.shader.get()); - + if (!visit(m_raygen.shader.get())) return; + for (const auto& missInfo : m_misses) if (!visit(missInfo.shader.get())) return; + for (const auto& anyHitInfo : m_hitGroups.anyHits) if (!visit(anyHitInfo.shader.get())) return; + for (const auto& closestHitInfo : m_hitGroups.closestHits) if (!visit(closestHitInfo.shader.get())) return; + for (const auto& intersectionInfo : m_hitGroups.intersections) if (!visit(intersectionInfo.shader.get())) return; + for (const auto& callableInfo : m_callables) if (!visit(callableInfo.shader.get())) return; } inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final diff --git a/include/nbl/asset/ICPURenderpass.h b/include/nbl/asset/ICPURenderpass.h index 39fe388427..7622609789 100644 --- a/include/nbl/asset/ICPURenderpass.h +++ b/include/nbl/asset/ICPURenderpass.h @@ -44,7 +44,7 @@ class ICPURenderpass : public IRenderpass, public IAsset private: - inline virtual void visitDependentsImpl(std::function visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { } diff --git a/include/nbl/asset/ICPURenderpassIndependentPipeline.h b/include/nbl/asset/ICPURenderpassIndependentPipeline.h index 6db56fa279..422cf548b4 100644 --- a/include/nbl/asset/ICPURenderpassIndependentPipeline.h +++ b/include/nbl/asset/ICPURenderpassIndependentPipeline.h @@ -150,7 +150,7 @@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, private: - inline virtual void visitDependentsImpl(std::function visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { } }; diff --git a/include/nbl/asset/ICPUSampler.h b/include/nbl/asset/ICPUSampler.h index 4df7eb9ab5..6ddf479319 100644 --- a/include/nbl/asset/ICPUSampler.h +++ b/include/nbl/asset/ICPUSampler.h @@ -71,7 +71,7 @@ class ICPUSampler : public ISampler, public IAsset private: -
inline virtual void visitDependentsImpl(std::function visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { } }; diff --git a/include/nbl/asset/ICPUSkeleton.h b/include/nbl/asset/ICPUSkeleton.h index 0d6f0d405b..e66293da0c 100644 --- a/include/nbl/asset/ICPUSkeleton.h +++ b/include/nbl/asset/ICPUSkeleton.h @@ -81,7 +81,7 @@ class ICPUSkeleton final : public ISkeleton, public IAsset private: - inline virtual void visitDependentsImpl(std::function visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { if (!visit(m_defaultTransforms.buffer.get())) return; if (!visit(m_parentJointIDs.buffer.get())) return; diff --git a/include/nbl/asset/IShader.h b/include/nbl/asset/IShader.h index 8ce332cb99..3ef14f3e78 100644 --- a/include/nbl/asset/IShader.h +++ b/include/nbl/asset/IShader.h @@ -99,7 +99,7 @@ class IShader : public IAsset private: - inline virtual void visitDependentsImpl(std::function visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { if (!visit(m_code.get())) return; } From d58554e8d03b9bcd6ba3ada56887dfb06b9ae04c Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 14 Jun 2025 19:13:31 +0700 Subject: [PATCH 307/346] Fix visitDependents_impl on some asset --- include/nbl/asset/ICPUAccelerationStructure.h | 1 + include/nbl/asset/ICPUDescriptorSetLayout.h | 2 +- include/nbl/asset/ICPUPipelineLayout.h | 12 +++++++++++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index 3836690bda..4c837dc91a 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -362,6 +362,7 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA inline virtual void visitDependents_impl(std::function visit) const override { + if (!m_instances) return; for (const 
auto& instance : *m_instances) if (!visit(instance.getBase().blas.get())) return; } diff --git a/include/nbl/asset/ICPUDescriptorSetLayout.h b/include/nbl/asset/ICPUDescriptorSetLayout.h index 216297a562..19e38a26b2 100644 --- a/include/nbl/asset/ICPUDescriptorSetLayout.h +++ b/include/nbl/asset/ICPUDescriptorSetLayout.h @@ -65,7 +65,7 @@ class ICPUDescriptorSetLayout : public IDescriptorSetLayout, public inline virtual void visitDependents_impl(std::function visit) const override { - if (m_immutableSamplers) return; + if (!m_immutableSamplers) return; for (const auto& sampler : *m_immutableSamplers) if (!visit(sampler.get())) return; } diff --git a/include/nbl/asset/ICPUPipelineLayout.h b/include/nbl/asset/ICPUPipelineLayout.h index f4f636601c..0684980cf8 100644 --- a/include/nbl/asset/ICPUPipelineLayout.h +++ b/include/nbl/asset/ICPUPipelineLayout.h @@ -66,6 +66,16 @@ class ICPUPipelineLayout : public IAsset, public IPipelineLayoutvalid()) return false; + } + return true; + } + protected: virtual ~ICPUPipelineLayout() = default; @@ -73,7 +83,7 @@ class ICPUPipelineLayout : public IAsset, public IPipelineLayout Date: Sat, 14 Jun 2025 19:15:57 +0700 Subject: [PATCH 308/346] Implement ICPUBuffer valid() --- include/nbl/asset/ICPUBuffer.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/nbl/asset/ICPUBuffer.h b/include/nbl/asset/ICPUBuffer.h index 30232c061a..66170ac20d 100644 --- a/include/nbl/asset/ICPUBuffer.h +++ b/include/nbl/asset/ICPUBuffer.h @@ -110,6 +110,14 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed return true; } + inline virtual bool valid() const override + { + if (!m_data) return false; + if (!m_mem_resource) return false; + // check if alignment is power of two + return (m_alignment > 0 && !(m_alignment & (m_alignment - 1))); + } + protected: inline void discardContent_impl() override { From c3c50b43a861c9ef7d9485fe3f759a21f14e041e Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 14 Jun 2025 19:16:12 +0700 
Subject: [PATCH 309/346] Implement ICPUBufferView::valid() --- include/nbl/asset/ICPUBufferView.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/nbl/asset/ICPUBufferView.h b/include/nbl/asset/ICPUBufferView.h index 512103a9cd..c96f0377f4 100644 --- a/include/nbl/asset/ICPUBufferView.h +++ b/include/nbl/asset/ICPUBufferView.h @@ -46,6 +46,16 @@ class ICPUBufferView : public IBufferView, public IAsset m_size = _size; } + inline virtual bool valid() const override + { + if (!m_buffer) return false; + if (!m_buffer->valid()) return false; + if (m_size <= 0) return false; + if (m_offset >= m_buffer->getSize()) return false; + if (m_size > m_buffer->getSize() - m_offset) return false; + return true; + } + protected: virtual ~ICPUBufferView() = default; From 51e408b002bea254a6526d4b2af6eb913ba98169 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 14 Jun 2025 19:16:25 +0700 Subject: [PATCH 310/346] Implement ICPUDescriptorSet::valid() --- include/nbl/asset/ICPUDescriptorSet.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/nbl/asset/ICPUDescriptorSet.h b/include/nbl/asset/ICPUDescriptorSet.h index b0d9786868..776e4e1409 100644 --- a/include/nbl/asset/ICPUDescriptorSet.h +++ b/include/nbl/asset/ICPUDescriptorSet.h @@ -77,6 +77,11 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSet clone(uint32_t _depth = ~0u) const override; + inline virtual bool valid() const override { + if (!m_layout || !m_layout->valid()) return false; + return true; + } + protected: virtual ~ICPUDescriptorSet() = default; From 345dbd8ff5a99536c4ed7b2397189888bfa8d359 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 14 Jun 2025 19:16:38 +0700 Subject: [PATCH 311/346] Implement ICPUDescriptorSetLayout::valid() --- include/nbl/asset/ICPUDescriptorSetLayout.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/nbl/asset/ICPUDescriptorSetLayout.h b/include/nbl/asset/ICPUDescriptorSetLayout.h index 19e38a26b2..da249620bc 100644 ---
a/include/nbl/asset/ICPUDescriptorSetLayout.h +++ b/include/nbl/asset/ICPUDescriptorSetLayout.h @@ -56,6 +56,10 @@ class ICPUDescriptorSetLayout : public IDescriptorSetLayout, public constexpr static inline auto AssetType = ET_DESCRIPTOR_SET_LAYOUT; inline E_TYPE getAssetType() const override { return AssetType; } + inline virtual bool valid() const override + { + return true; // no modification is possible after creation + } protected: virtual ~ICPUDescriptorSetLayout() = default; From 4a4b51d1d4b2df7e09c901a3ac0f3dbc1721827c Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 14 Jun 2025 19:16:49 +0700 Subject: [PATCH 312/346] Implement ICPUImage::valid() --- include/nbl/asset/ICPUImage.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/nbl/asset/ICPUImage.h b/include/nbl/asset/ICPUImage.h index 847b796da0..78c7c4891f 100644 --- a/include/nbl/asset/ICPUImage.h +++ b/include/nbl/asset/ICPUImage.h @@ -195,6 +195,16 @@ class NBL_API2 ICPUImage final : public IImage, public IPreHashed return true; } + inline virtual bool valid() const override + { + if (!validateCreationParameters(m_creationParams)) return false; + if (info != m_creationParams.format) return false; + if (!buffer->valid()) return false; + for (const auto& region : regions) + if (!region.isValid()) return false; + return true; + } + protected: inline ICPUImage(const SCreationParams& _params) : IImage(_params) {} virtual ~ICPUImage() = default; From c5d1d85e120a312f959de7c087c27580a4ef7fa8 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 14 Jun 2025 19:17:03 +0700 Subject: [PATCH 313/346] Implement ICPUImageView::valid() --- include/nbl/asset/ICPUImageView.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/nbl/asset/ICPUImageView.h b/include/nbl/asset/ICPUImageView.h index 6338021aed..953651c604 100644 --- a/include/nbl/asset/ICPUImageView.h +++ b/include/nbl/asset/ICPUImageView.h @@ -62,6 +62,17 @@ class ICPUImageView final : public IImageView, public IAsset 
params.subresourceRange.aspectMask = aspect.value; } + inline virtual bool valid() const override + { + if (!validateCreationParameters(params)) return false; + + // image nullptr already checked in validateCreationParameters; + assert(params.image); + if (!params.image->valid()) return false; + + return true; + } + protected: virtual ~ICPUImageView() = default; From 70870997f7fa776fd4c3ef059b702a7f94c9b1d5 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 14 Jun 2025 19:17:16 +0700 Subject: [PATCH 314/346] Implement ICPUPipelineCache::valid() --- include/nbl/asset/ICPUPipelineCache.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/nbl/asset/ICPUPipelineCache.h b/include/nbl/asset/ICPUPipelineCache.h index a0d4373c6e..702b86620e 100644 --- a/include/nbl/asset/ICPUPipelineCache.h +++ b/include/nbl/asset/ICPUPipelineCache.h @@ -83,6 +83,11 @@ class ICPUPipelineCache final : public IPreHashed // const auto& getEntries() const {return m_cache;} + inline virtual bool valid() const override + { + return true; + } + protected: inline void discardContent_impl() override { From 73a17a07bc70ade6d45129f4f752c71d787bdfd4 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 14 Jun 2025 19:21:51 +0700 Subject: [PATCH 315/346] Implement ICPUSampler::valid() --- include/nbl/asset/ICPURenderpass.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/nbl/asset/ICPURenderpass.h b/include/nbl/asset/ICPURenderpass.h index 7622609789..a131b44add 100644 --- a/include/nbl/asset/ICPURenderpass.h +++ b/include/nbl/asset/ICPURenderpass.h @@ -38,6 +38,12 @@ class ICPURenderpass : public IRenderpass, public IAsset return ET_RENDERPASS; } + inline virtual bool valid() const override + { + // no modification is possible after creation. 
parameter is validated when creating renderpass + return true; + } + protected: inline ICPURenderpass(const SCreationParams& _params, const SCreationParamValidationResult& _validation) : IRenderpass(_params, _validation) {} inline ~ICPURenderpass() = default; From c7cff1dcefc61879c0a168bb899e357cdc0e609d Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Jun 2025 12:31:57 +0700 Subject: [PATCH 316/346] Remove valid implementation on IAsset and implement valid for all derived class of IAsset --- include/nbl/asset/IAsset.h | 6 +-- include/nbl/asset/ICPUAccelerationStructure.h | 38 +++++++++++++++++++ include/nbl/asset/ICPUAnimationLibrary.h | 1 + include/nbl/asset/ICPUImage.h | 7 ++-- include/nbl/asset/ICPUMesh.h | 10 +++++ include/nbl/asset/ICPUMeshBuffer.h | 5 +++ .../asset/ICPURenderpassIndependentPipeline.h | 5 +++ include/nbl/asset/ICPUSampler.h | 1 + include/nbl/asset/ICPUSkeleton.h | 1 + include/nbl/asset/IShader.h | 8 ++++ 10 files changed, 74 insertions(+), 8 deletions(-) diff --git a/include/nbl/asset/IAsset.h b/include/nbl/asset/IAsset.h index 2e45f62bbb..dc77931c25 100644 --- a/include/nbl/asset/IAsset.h +++ b/include/nbl/asset/IAsset.h @@ -175,11 +175,7 @@ class IAsset : virtual public core::IReferenceCounted }); } - virtual bool valid() const - { - //TODO(kevinyu): Temporary set this to true to make changes compile. 
Will revisit this later for each asset - return true; - } + virtual bool valid() const = 0; protected: inline IAsset() = default; diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index 4c837dc91a..2c4933d36c 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -231,6 +231,33 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo return !m_geometryPrimitiveCount || !m_triangleGeoms && !m_AABBGeoms; } + inline virtual bool valid() const override + { + if (!validBuildFlags(m_buildFlags)) return false; + + size_t geometryCount = 0; + if (m_buildFlags.hasFlags(BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) + { + if (!m_AABBGeoms || m_triangleGeoms) return false; + geometryCount = m_AABBGeoms->size(); + } + else + { + if (!m_triangleGeoms || m_AABBGeoms) return false; + geometryCount = m_triangleGeoms->size(); + } + + // https://registry.khronos.org/vulkan/specs/latest/man/html/vkGetAccelerationStructureBuildSizesKHR.html#VUID-vkGetAccelerationStructureBuildSizesKHR-pBuildInfo-03619 + if (geometryCount == 0) { + if (m_geometryPrimitiveCount && m_geometryPrimitiveCount->size() > 0) return false; + } + else + { + if (!m_geometryPrimitiveCount || m_geometryPrimitiveCount->size() != geometryCount) return false; + } + return true; + } + protected: virtual ~ICPUBottomLevelAccelerationStructure() = default; @@ -352,6 +379,17 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA return cp; } + inline virtual bool valid() const override + { + if (!validBuildFlags(m_buildFlags)) return false; + if (!m_instances) return false; + for (const auto& instance : *m_instances) + if (!instance.getBase().blas->valid()) return false; + if (m_buildRangeInfo.instanceCount != m_instances->size()) return false; + if (m_buildRangeInfo.instanceByteOffset % 16 != 0) return false; + return true; + } + protected: virtual 
~ICPUTopLevelAccelerationStructure() = default; diff --git a/include/nbl/asset/ICPUAnimationLibrary.h b/include/nbl/asset/ICPUAnimationLibrary.h index 2d620f562c..bcaae3bf3e 100644 --- a/include/nbl/asset/ICPUAnimationLibrary.h +++ b/include/nbl/asset/ICPUAnimationLibrary.h @@ -95,6 +95,7 @@ class ICPUAnimationLibrary final : public IAnimationLibrary, public constexpr static inline auto AssetType = ET_ANIMATION_LIBRARY; inline E_TYPE getAssetType() const override { return AssetType; } + inline virtual bool valid() const override { return true; } private: diff --git a/include/nbl/asset/ICPUImage.h b/include/nbl/asset/ICPUImage.h index 78c7c4891f..01ee3d41e0 100644 --- a/include/nbl/asset/ICPUImage.h +++ b/include/nbl/asset/ICPUImage.h @@ -199,9 +199,10 @@ class NBL_API2 ICPUImage final : public IImage, public IPreHashed { if (!validateCreationParameters(m_creationParams)) return false; if (info != m_creationParams.format) return false; - if (!buffer->valid()) return false; - for (const auto& region : regions) - if (!region.isValid()) return false; + if (buffer && !buffer->valid()) return false; + if (regions) + for (const auto& region : *regions) + if (!region.isValid()) return false; return true; } diff --git a/include/nbl/asset/ICPUMesh.h b/include/nbl/asset/ICPUMesh.h index 019578775c..0f780ef437 100644 --- a/include/nbl/asset/ICPUMesh.h +++ b/include/nbl/asset/ICPUMesh.h @@ -81,6 +81,16 @@ class ICPUMesh final : public IMesh, public IAsset return cp; } + inline virtual bool valid() const override + { + for (const auto& meshBuffer : m_meshBuffers) + { + if (!meshBuffer) return false; + if (!meshBuffer->valid()) return false; + } + return true; + } + protected: private: diff --git a/include/nbl/asset/ICPUMeshBuffer.h b/include/nbl/asset/ICPUMeshBuffer.h index 8fc5ae26e9..6bd3cd5700 100644 --- a/include/nbl/asset/ICPUMeshBuffer.h +++ b/include/nbl/asset/ICPUMeshBuffer.h @@ -610,7 +610,12 @@ class ICPUMeshBuffer final : public 
IMeshBuffer(const_cast(this)->getJointAABBs()); } + inline virtual bool valid() const override + { + return true; + } + private: inline virtual void visitDependents_impl(std::function visit) const override { } diff --git a/include/nbl/asset/ICPURenderpassIndependentPipeline.h b/include/nbl/asset/ICPURenderpassIndependentPipeline.h index 422cf548b4..b349aab888 100644 --- a/include/nbl/asset/ICPURenderpassIndependentPipeline.h +++ b/include/nbl/asset/ICPURenderpassIndependentPipeline.h @@ -93,6 +93,11 @@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, m_layout = std::move(_layout); } + inline virtual bool valid() const override + { + return m_layout && m_layout->valid(); + } + #if 0 // The getters are weird because the shader pointer needs patching inline IShader::SSpecInfo getSpecInfos(const hlsl::ShaderStage stage) diff --git a/include/nbl/asset/ICPUSampler.h b/include/nbl/asset/ICPUSampler.h index 6ddf479319..8db568f26b 100644 --- a/include/nbl/asset/ICPUSampler.h +++ b/include/nbl/asset/ICPUSampler.h @@ -68,6 +68,7 @@ class ICPUSampler : public ISampler, public IAsset constexpr static inline auto AssetType = ET_SAMPLER; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } + inline virtual bool valid() const override { return true; } private: diff --git a/include/nbl/asset/ICPUSkeleton.h b/include/nbl/asset/ICPUSkeleton.h index e66293da0c..361468d1c5 100644 --- a/include/nbl/asset/ICPUSkeleton.h +++ b/include/nbl/asset/ICPUSkeleton.h @@ -78,6 +78,7 @@ class ICPUSkeleton final : public ISkeleton, public IAsset constexpr static inline auto AssetType = ET_SKELETON; inline E_TYPE getAssetType() const override { return AssetType; } + inline virtual bool valid() const override { return true; } private: diff --git a/include/nbl/asset/IShader.h b/include/nbl/asset/IShader.h index 3ef14f3e78..d1ae2e0c86 100644 --- a/include/nbl/asset/IShader.h +++ b/include/nbl/asset/IShader.h @@ -87,6 +87,14 @@ class IShader : public 
IAsset // TODO: `void setContent(core::smart_refctd_ptr&&,const E_CONTENT_TYPE)` + inline virtual bool valid() const override + { + if (!m_code) return false; + if (m_contentType == E_CONTENT_TYPE::ECT_UNKNOWN) return false; + // Note(kevyuu) : Should we check for m_filepathHint if content type is not spirv. What if no pragma includ in the source code. Do we even need m_filepathHint in that case? + return true; + } + // alias for legacy reasons using E_SHADER_STAGE = hlsl::ShaderStage; From 437c19408a3e5900f4a69fbc1f5ed7a9544e18eb Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 16 Jun 2025 15:10:06 +0700 Subject: [PATCH 317/346] use x-macros for config compat between hlsl and cpp --- examples_tests | 2 +- .../hlsl/workgroup2/arithmetic_config.hlsl | 165 +++++++++++++----- .../impl/arithmetic_config_def.hlsl | 34 ++++ .../workgroup2/impl/items_per_invoc_def.hlsl | 8 + .../workgroup2/impl/virtual_wg_size_def.hlsl | 8 + 5 files changed, 176 insertions(+), 41 deletions(-) create mode 100644 include/nbl/builtin/hlsl/workgroup2/impl/arithmetic_config_def.hlsl create mode 100644 include/nbl/builtin/hlsl/workgroup2/impl/items_per_invoc_def.hlsl create mode 100644 include/nbl/builtin/hlsl/workgroup2/impl/virtual_wg_size_def.hlsl diff --git a/examples_tests b/examples_tests index 1710b69862..4c10dc1cdb 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 1710b698621796aa767edf7bc940e55e6758c2a8 +Subproject commit 4c10dc1cdba4ab12dfedef97768aa4a10e606213 diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index f894eac58a..6eb6a535fe 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -6,6 +6,7 @@ #include "nbl/builtin/hlsl/cpp_compat.hlsl" #include "nbl/builtin/hlsl/tuple.hlsl" +#include "nbl/builtin/hlsl/mpl.hlsl" namespace nbl { @@ -19,23 +20,37 @@ namespace impl template 
struct virtual_wg_size_log2 { - NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSizeLog2 = _WorkgroupSizeLog2; - NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = _SubgroupSizeLog2; + #define DEFINE_ASSIGN(TYPE,ID,...) NBL_CONSTEXPR_STATIC_INLINE TYPE ID = __VA_ARGS__; + #define DEFINE_VIRTUAL_WG_T(ID) ID + #define DEFINE_MPL_MAX_V(TYPE,ARG1,ARG2) mpl::max_v + #define DEFINE_COND_VAL(TYPE,COND,TRUE_VAL,FALSE_VAL) conditional_value::value + #include "impl/virtual_wg_size_def.hlsl" + #undef DEFINE_COND_VAL + #undef DEFINE_MPL_MAX_V + #undef DEFINE_VIRTUAL_WG_T + #undef DEFINE_ASSIGN + + // must have at least enough level 0 outputs to feed a single subgroup static_assert(WorkgroupSizeLog2>=SubgroupSizeLog2, "WorkgroupSize cannot be smaller than SubgroupSize"); static_assert(WorkgroupSizeLog2<=SubgroupSizeLog2*3+4, "WorkgroupSize cannot be larger than (SubgroupSize^3)*16"); - - NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2),uint16_t,conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2*2+2),uint16_t,3,2>::value,1>::value; - NBL_CONSTEXPR_STATIC_INLINE uint16_t value = mpl::max_v; - // must have at least enough level 0 outputs to feed a single subgroup }; template struct items_per_invocation { - NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocationProductLog2 = mpl::max_v; - NBL_CONSTEXPR_STATIC_INLINE uint16_t value0 = BaseItemsPerInvocation; - NBL_CONSTEXPR_STATIC_INLINE uint16_t value1 = uint16_t(0x1u) << conditional_value, ItemsPerInvocationProductLog2>::value; - NBL_CONSTEXPR_STATIC_INLINE uint16_t value2 = uint16_t(0x1u) << mpl::max_v; + #define DEFINE_ASSIGN(TYPE,ID,...) 
NBL_CONSTEXPR_STATIC_INLINE TYPE ID = __VA_ARGS__; + #define DEFINE_VIRTUAL_WG_T(ID) VirtualWorkgroup::ID + #define DEFINE_ITEMS_INVOC_T(ID) ID + #define DEFINE_MPL_MIN_V(TYPE,ARG1,ARG2) mpl::min_v + #define DEFINE_MPL_MAX_V(TYPE,ARG1,ARG2) mpl::max_v + #define DEFINE_COND_VAL(TYPE,COND,TRUE_VAL,FALSE_VAL) conditional_value::value + #include "impl/items_per_invoc_def.hlsl" + #undef DEFINE_COND_VAL + #undef DEFINE_MPL_MAX_V + #undef DEFINE_MPL_MIN_V + #undef DEFINE_ITEMS_INVOC_T + #undef DEFINE_VIRTUAL_WG_T + #undef DEFINE_ASSIGN using ItemsPerInvocation = tuple,integral_constant,integral_constant >; }; @@ -44,47 +59,35 @@ struct items_per_invocation template struct ArithmeticConfiguration { - NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSizeLog2 = _WorkgroupSizeLog2; - NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << WorkgroupSizeLog2; - NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = _SubgroupSizeLog2; - NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; - - using virtual_wg_t = impl::virtual_wg_size_log2; - NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = virtual_wg_t::levels; - NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualWorkgroupSize = uint16_t(0x1u) << virtual_wg_t::value; - static_assert(VirtualWorkgroupSize<=WorkgroupSize*SubgroupSize); - + using virtual_wg_t = impl::virtual_wg_size_log2<_WorkgroupSizeLog2, _SubgroupSizeLog2>; using items_per_invoc_t = impl::items_per_invocation; using ItemsPerInvocation = typename items_per_invoc_t::ItemsPerInvocation; - NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_0 = tuple_element<0,ItemsPerInvocation>::type::value; - NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_1 = tuple_element<1,ItemsPerInvocation>::type::value; - NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = tuple_element<2,ItemsPerInvocation>::type::value; - static_assert(ItemsPerInvocation_2<=4, "4 level scan would have been needed with this config!"); - 
NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelInputCount_1 = conditional_value>SubgroupSizeLog2), SubgroupSize>, - SubgroupSize*ItemsPerInvocation_1>::value; - NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelInputCount_2 = conditional_value::value; - NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualInvocationsAtLevel1 = LevelInputCount_1 / ItemsPerInvocation_1; + #define DEFINE_ASSIGN(TYPE,ID,...) NBL_CONSTEXPR_STATIC_INLINE TYPE ID = __VA_ARGS__; + #define DEFINE_VIRTUAL_WG_T(ID) virtual_wg_t::ID + #define DEFINE_ITEMS_INVOC_T(ID) items_per_invoc_t::ID + #define DEFINE_CONFIG_T(ID) ID + #define DEFINE_MPL_MAX_V(TYPE,ARG1,ARG2) mpl::max_v + #define DEFINE_COND_VAL(TYPE,COND,TRUE_VAL,FALSE_VAL) conditional_value::value + #include "impl/arithmetic_config_def.hlsl" + #undef DEFINE_COND_VAL + #undef DEFINE_MPL_MAX_V + #undef DEFINE_CONFIG_T + #undef DEFINE_ITEMS_INVOC_T + #undef DEFINE_VIRTUAL_WG_T + #undef DEFINE_ASSIGN - NBL_CONSTEXPR_STATIC_INLINE uint16_t __padding = conditional_value::value; - NBL_CONSTEXPR_STATIC_INLINE uint16_t __channelStride_1 = conditional_value::value + __padding; - NBL_CONSTEXPR_STATIC_INLINE uint16_t __channelStride_2 = conditional_value::value; using ChannelStride = tuple,integral_constant,integral_constant >; // we don't use stride 0 - // user specified the shared mem size of Scalars - NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = conditional_value::value + LevelInputCount_1 - >::value; + static_assert(VirtualWorkgroupSize<=WorkgroupSize*SubgroupSize); + static_assert(ItemsPerInvocation_2<=4, "4 level scan would have been needed with this config!"); +#ifdef __HLSL_VERSION static bool electLast() { return glsl::gl_SubgroupInvocationID()==SubgroupSize-1; } +#endif // gets a subgroupID as if each workgroup has (VirtualWorkgroupSize/SubgroupSize) subgroups // each subgroup does work (VirtualWorkgroupSize/WorkgroupSize) times, the index denoted by workgroupInVirtualIndex @@ -140,6 +143,88 @@ struct ArithmeticConfiguration } }; +#ifndef 
__HLSL_VERSION +namespace impl +{ +struct SVirtualWGSizeLog2 +{ + static SVirtualWGSizeLog2 create(const uint16_t _WorkgroupSizeLog2, const uint16_t _SubgroupSizeLog2) + { + SVirtualWGSizeLog2 retval; + #define DEFINE_ASSIGN(TYPE,ID,...) retval.ID = __VA_ARGS__; + #define DEFINE_VIRTUAL_WG_T(ID) retval.ID + #define DEFINE_MPL_MAX_V(TYPE,ARG1,ARG2) hlsl::max(ARG1, ARG2) + #define DEFINE_COND_VAL(TYPE,COND,TRUE_VAL,FALSE_VAL) (COND ? TRUE_VAL : FALSE_VAL) + #include "impl/virtual_wg_size_def.hlsl" + #undef DEFINE_COND_VAL + #undef DEFINE_MPL_MAX_V + #undef DEFINE_VIRTUAL_WG_T + #undef DEFINE_ASSIGN + return retval; + } + + #define DEFINE_ASSIGN(TYPE,ID,...) TYPE ID; + #include "impl/virtual_wg_size_def.hlsl" + #undef DEFINE_ASSIGN +}; + +struct SItemsPerInvoc +{ + static SItemsPerInvoc create(const SVirtualWGSizeLog2 virtualWgSizeLog2, const uint16_t BaseItemsPerInvocation) + { + SItemsPerInvoc retval; + #define DEFINE_ASSIGN(TYPE,ID,...) retval.ID = __VA_ARGS__; + #define DEFINE_VIRTUAL_WG_T(ID) virtualWgSizeLog2.ID + #define DEFINE_ITEMS_INVOC_T(ID) retval.ID + #define DEFINE_MPL_MIN_V(TYPE,ARG1,ARG2) hlsl::min(ARG1, ARG2) + #define DEFINE_MPL_MAX_V(TYPE,ARG1,ARG2) hlsl::max(ARG1, ARG2) + #define DEFINE_COND_VAL(TYPE,COND,TRUE_VAL,FALSE_VAL) (COND ? TRUE_VAL : FALSE_VAL) + #include "impl/items_per_invoc_def.hlsl" + #undef DEFINE_COND_VAL + #undef DEFINE_MPL_MAX_V + #undef DEFINE_MPL_MIN_V + #undef DEFINE_ITEMS_INVOC_T + #undef DEFINE_VIRTUAL_WG_T + #undef DEFINE_ASSIGN + return retval; + } + + #define DEFINE_ASSIGN(TYPE,ID,...) 
TYPE ID; + #include "impl/items_per_invoc_def.hlsl" + #undef DEFINE_ASSIGN +}; +} + +struct SArithmeticConfiguration +{ + static SArithmeticConfiguration create(const uint16_t _WorkgroupSizeLog2, const uint16_t _SubgroupSizeLog2, const uint16_t _ItemsPerInvocation) + { + impl::SVirtualWGSizeLog2 virtualWgSizeLog2 = impl::SVirtualWGSizeLog2::create(_WorkgroupSizeLog2, _SubgroupSizeLog2); + impl::SItemsPerInvoc itemsPerInvoc = impl::SItemsPerInvoc::create(virtualWgSizeLog2, _ItemsPerInvocation); + + SArithmeticConfiguration retval; + #define DEFINE_ASSIGN(TYPE,ID,...) retval.ID = __VA_ARGS__; + #define DEFINE_VIRTUAL_WG_T(ID) virtualWgSizeLog2.ID + #define DEFINE_ITEMS_INVOC_T(ID) itemsPerInvoc.ID + #define DEFINE_CONFIG_T(ID) retval.ID + #define DEFINE_MPL_MAX_V(TYPE,ARG1,ARG2) hlsl::max(ARG1, ARG2) + #define DEFINE_COND_VAL(TYPE,COND,TRUE_VAL,FALSE_VAL) (COND ? TRUE_VAL : FALSE_VAL) + #include "impl/arithmetic_config_def.hlsl" + #undef DEFINE_COND_VAL + #undef DEFINE_MPL_MAX_V + #undef DEFINE_CONFIG_T + #undef DEFINE_ITEMS_INVOC_T + #undef DEFINE_VIRTUAL_WG_T + #undef DEFINE_ASSIGN + return retval; + } + + #define DEFINE_ASSIGN(TYPE,ID,...) TYPE ID; + #include "impl/arithmetic_config_def.hlsl" + #undef DEFINE_ASSIGN +}; +#endif + template struct is_configuration : bool_constant {}; diff --git a/include/nbl/builtin/hlsl/workgroup2/impl/arithmetic_config_def.hlsl b/include/nbl/builtin/hlsl/workgroup2/impl/arithmetic_config_def.hlsl new file mode 100644 index 0000000000..4ea6fc010d --- /dev/null +++ b/include/nbl/builtin/hlsl/workgroup2/impl/arithmetic_config_def.hlsl @@ -0,0 +1,34 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +DEFINE_ASSIGN(uint16_t, WorkgroupSizeLog2, _WorkgroupSizeLog2) +DEFINE_ASSIGN(uint16_t, WorkgroupSize, uint16_t(0x1u) << DEFINE_CONFIG_T(WorkgroupSizeLog2)) +DEFINE_ASSIGN(uint16_t, SubgroupSizeLog2, _SubgroupSizeLog2) +DEFINE_ASSIGN(uint16_t, SubgroupSize, uint16_t(0x1u) << DEFINE_CONFIG_T(SubgroupSizeLog2)) + +DEFINE_ASSIGN(uint16_t, LevelCount, DEFINE_VIRTUAL_WG_T(levels)) +DEFINE_ASSIGN(uint16_t, VirtualWorkgroupSize, uint16_t(0x1u) << DEFINE_VIRTUAL_WG_T(value)) + +DEFINE_ASSIGN(uint16_t, ItemsPerInvocation_0, DEFINE_ITEMS_INVOC_T(value0)) +DEFINE_ASSIGN(uint16_t, ItemsPerInvocation_1, DEFINE_ITEMS_INVOC_T(value1)) +DEFINE_ASSIGN(uint16_t, ItemsPerInvocation_2, DEFINE_ITEMS_INVOC_T(value2)) + +DEFINE_ASSIGN(uint16_t, LevelInputCount_1, DEFINE_COND_VAL(uint16_t,(DEFINE_CONFIG_T(LevelCount)==3), + DEFINE_MPL_MAX_V(uint16_t, (DEFINE_CONFIG_T(VirtualWorkgroupSize)>>DEFINE_CONFIG_T(SubgroupSizeLog2)), DEFINE_CONFIG_T(SubgroupSize)), + DEFINE_CONFIG_T(SubgroupSize)*DEFINE_CONFIG_T(ItemsPerInvocation_1))) +DEFINE_ASSIGN(uint16_t, LevelInputCount_2, DEFINE_COND_VAL(uint16_t,(DEFINE_CONFIG_T(LevelCount)==3),DEFINE_CONFIG_T(SubgroupSize)*DEFINE_CONFIG_T(ItemsPerInvocation_2),0)) +DEFINE_ASSIGN(uint16_t, VirtualInvocationsAtLevel1, DEFINE_CONFIG_T(LevelInputCount_1) / DEFINE_CONFIG_T(ItemsPerInvocation_1)) + +DEFINE_ASSIGN(uint16_t, __padding, DEFINE_COND_VAL(uint16_t,(DEFINE_CONFIG_T(LevelCount)==3),DEFINE_CONFIG_T(SubgroupSize)-1,0)) +DEFINE_ASSIGN(uint16_t, __channelStride_1, DEFINE_COND_VAL(uint16_t,(DEFINE_CONFIG_T(LevelCount)==3),DEFINE_CONFIG_T(VirtualInvocationsAtLevel1),DEFINE_CONFIG_T(SubgroupSize)) + DEFINE_CONFIG_T(__padding)) +DEFINE_ASSIGN(uint16_t, __channelStride_2, DEFINE_COND_VAL(uint16_t,(DEFINE_CONFIG_T(LevelCount)==3),DEFINE_CONFIG_T(SubgroupSize),0)) + +// user specified the shared mem size of Scalars +DEFINE_ASSIGN(uint32_t, SharedScratchElementCount, 
DEFINE_COND_VAL(uint16_t,(DEFINE_CONFIG_T(LevelCount)==1), + 0, + DEFINE_COND_VAL(uint16_t,(DEFINE_CONFIG_T(LevelCount)==3), + DEFINE_CONFIG_T(LevelInputCount_2)+(DEFINE_CONFIG_T(SubgroupSize)*DEFINE_CONFIG_T(ItemsPerInvocation_1))-1, + 0 + ) + DEFINE_CONFIG_T(LevelInputCount_1) + )) diff --git a/include/nbl/builtin/hlsl/workgroup2/impl/items_per_invoc_def.hlsl b/include/nbl/builtin/hlsl/workgroup2/impl/items_per_invoc_def.hlsl new file mode 100644 index 0000000000..857b64d774 --- /dev/null +++ b/include/nbl/builtin/hlsl/workgroup2/impl/items_per_invoc_def.hlsl @@ -0,0 +1,8 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +DEFINE_ASSIGN(uint16_t, ItemsPerInvocationProductLog2, DEFINE_MPL_MAX_V(int16_t,DEFINE_VIRTUAL_WG_T(WorkgroupSizeLog2)-DEFINE_VIRTUAL_WG_T(SubgroupSizeLog2)*DEFINE_VIRTUAL_WG_T(levels),0)) +DEFINE_ASSIGN(uint16_t, value0, BaseItemsPerInvocation) +DEFINE_ASSIGN(uint16_t, value1, uint16_t(0x1u) << DEFINE_COND_VAL(uint16_t,(DEFINE_VIRTUAL_WG_T(levels)==3),DEFINE_MPL_MIN_V(uint16_t,DEFINE_ITEMS_INVOC_T(ItemsPerInvocationProductLog2),2),DEFINE_ITEMS_INVOC_T(ItemsPerInvocationProductLog2))) +DEFINE_ASSIGN(uint16_t, value2, uint16_t(0x1u) << DEFINE_MPL_MAX_V(int16_t,DEFINE_ITEMS_INVOC_T(ItemsPerInvocationProductLog2)-2,0)) \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/workgroup2/impl/virtual_wg_size_def.hlsl b/include/nbl/builtin/hlsl/workgroup2/impl/virtual_wg_size_def.hlsl new file mode 100644 index 0000000000..3190ba5df3 --- /dev/null +++ b/include/nbl/builtin/hlsl/workgroup2/impl/virtual_wg_size_def.hlsl @@ -0,0 +1,8 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +DEFINE_ASSIGN(uint16_t, WorkgroupSizeLog2, _WorkgroupSizeLog2) +DEFINE_ASSIGN(uint16_t, SubgroupSizeLog2, _SubgroupSizeLog2) +DEFINE_ASSIGN(uint16_t, levels, DEFINE_COND_VAL(uint16_t,(_WorkgroupSizeLog2>_SubgroupSizeLog2),DEFINE_COND_VAL(uint16_t,(_WorkgroupSizeLog2>_SubgroupSizeLog2*2+2),3,2),1)) +DEFINE_ASSIGN(uint16_t, value, DEFINE_MPL_MAX_V(uint16_t, _SubgroupSizeLog2*DEFINE_VIRTUAL_WG_T(levels), _WorkgroupSizeLog2)) From ae3946e5299f28e064bcd14870c4a6c1eb2f18c0 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Jun 2025 15:37:44 +0700 Subject: [PATCH 318/346] Add comment to some valid logic of top acceleration structure --- include/nbl/asset/ICPUAccelerationStructure.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index 2c4933d36c..feddcbb35f 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -386,6 +386,7 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA for (const auto& instance : *m_instances) if (!instance.getBase().blas->valid()) return false; if (m_buildRangeInfo.instanceCount != m_instances->size()) return false; + // https://registry.khronos.org/vulkan/specs/latest/man/html/VkAccelerationStructureBuildRangeInfoKHR.html#VUID-VkAccelerationStructureBuildRangeInfoKHR-primitiveOffset-03660 if (m_buildRangeInfo.instanceByteOffset % 16 != 0) return false; return true; } From ef2ed3ac6b199541fc6f782831f825bc266391db Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Jun 2025 15:39:46 +0700 Subject: [PATCH 319/346] Rename getSpecInfoVec to getSpecInfoVector --- include/nbl/asset/ICPURayTracingPipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h index 955275f819..8e6bdaf8b9 
100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -62,7 +62,7 @@ class ICPURayTracingPipeline final : public ICPUPipeline* getSpecInfoVec(hlsl::ShaderStage stage) + inline core::vector* getSpecInfoVector(hlsl::ShaderStage stage) { if (!isMutable()) return nullptr; switch (stage) From 9c8792594e9588e22450030a9c34bbaa4728924e Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Jun 2025 16:27:26 +0700 Subject: [PATCH 320/346] Fix indentation --- include/nbl/asset/IAsset.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/nbl/asset/IAsset.h b/include/nbl/asset/IAsset.h index dc77931c25..b35981ffc7 100644 --- a/include/nbl/asset/IAsset.h +++ b/include/nbl/asset/IAsset.h @@ -169,10 +169,10 @@ class IAsset : virtual public core::IReferenceCounted inline void visitDependents(std::function visit) { assert(isMutable()); - visitDependents([&](const IAsset* dependent) -> bool - { - return visit(const_cast(dependent)); - }); + visitDependents([&](const IAsset* dependent) -> bool + { + return visit(const_cast(dependent)); + }); } virtual bool valid() const = 0; From 697589ccada856f55b459fe57e7e8a2e9f3f0371 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Jun 2025 16:33:06 +0700 Subject: [PATCH 321/346] Remove virtual from final classes --- include/nbl/asset/ICPUAccelerationStructure.h | 8 ++++---- include/nbl/asset/ICPUAnimationLibrary.h | 4 ++-- include/nbl/asset/ICPUBuffer.h | 4 ++-- include/nbl/asset/ICPUBufferView.h | 4 ++-- include/nbl/asset/ICPUComputePipeline.h | 2 +- include/nbl/asset/ICPUDescriptorSet.h | 4 ++-- include/nbl/asset/ICPUDescriptorSetLayout.h | 4 ++-- include/nbl/asset/ICPUGraphicsPipeline.h | 6 +++--- include/nbl/asset/ICPUImage.h | 4 ++-- include/nbl/asset/ICPUImageView.h | 4 ++-- include/nbl/asset/ICPUMesh.h | 8 ++++---- include/nbl/asset/ICPUMeshBuffer.h | 4 ++-- include/nbl/asset/ICPUPipelineCache.h | 4 ++-- include/nbl/asset/ICPUPipelineLayout.h | 4 ++-- 
include/nbl/asset/ICPURayTracingPipeline.h | 6 +++--- include/nbl/asset/ICPURenderpass.h | 4 ++-- include/nbl/asset/ICPURenderpassIndependentPipeline.h | 4 ++-- include/nbl/asset/ICPUSampler.h | 4 ++-- include/nbl/asset/ICPUSkeleton.h | 4 ++-- 19 files changed, 43 insertions(+), 43 deletions(-) diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index feddcbb35f..a4f1e9dec4 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -231,7 +231,7 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo return !m_geometryPrimitiveCount || !m_triangleGeoms && !m_AABBGeoms; } - inline virtual bool valid() const override + inline bool valid() const override { if (!validBuildFlags(m_buildFlags)) return false; @@ -275,7 +275,7 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo core::smart_refctd_dynamic_array m_geometryPrimitiveCount = nullptr; core::bitflag m_buildFlags = BUILD_FLAGS::PREFER_FAST_TRACE_BIT; - inline virtual void visitDependents_impl(std::function visit) const override {} + inline void visitDependents_impl(std::function visit) const override {} }; class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelAccelerationStructure @@ -379,7 +379,7 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA return cp; } - inline virtual bool valid() const override + inline bool valid() const override { if (!validBuildFlags(m_buildFlags)) return false; if (!m_instances) return false; @@ -399,7 +399,7 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA hlsl::acceleration_structures::top_level::BuildRangeInfo m_buildRangeInfo; core::bitflag m_buildFlags = BUILD_FLAGS::PREFER_FAST_BUILD_BIT; - inline virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function 
visit) const override { if (!m_instances) return; for (const auto& instance : *m_instances) diff --git a/include/nbl/asset/ICPUAnimationLibrary.h b/include/nbl/asset/ICPUAnimationLibrary.h index bcaae3bf3e..321cefa33b 100644 --- a/include/nbl/asset/ICPUAnimationLibrary.h +++ b/include/nbl/asset/ICPUAnimationLibrary.h @@ -95,11 +95,11 @@ class ICPUAnimationLibrary final : public IAnimationLibrary, public constexpr static inline auto AssetType = ET_ANIMATION_LIBRARY; inline E_TYPE getAssetType() const override { return AssetType; } - inline virtual bool valid() const override { return true; } + inline bool valid() const override { return true; } private: - virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { if (!visit(m_keyframeStorageBinding.buffer.get())) return; if (!visit(m_timestampStorageBinding.buffer.get())) return; diff --git a/include/nbl/asset/ICPUBuffer.h b/include/nbl/asset/ICPUBuffer.h index 66170ac20d..46105b3c0e 100644 --- a/include/nbl/asset/ICPUBuffer.h +++ b/include/nbl/asset/ICPUBuffer.h @@ -110,7 +110,7 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed return true; } - inline virtual bool valid() const override + inline bool valid() const override { if (!m_data) return false; if (!m_mem_resource) return false; @@ -137,7 +137,7 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed discardContent_impl(); } - inline virtual void visitDependents_impl(std::function visit) const override {} + inline void visitDependents_impl(std::function visit) const override {} void* m_data; core::smart_refctd_ptr m_mem_resource; diff --git a/include/nbl/asset/ICPUBufferView.h b/include/nbl/asset/ICPUBufferView.h index c96f0377f4..8634fd8394 100644 --- a/include/nbl/asset/ICPUBufferView.h +++ b/include/nbl/asset/ICPUBufferView.h @@ -46,7 +46,7 @@ class ICPUBufferView : public IBufferView, public IAsset m_size = _size; } - inline virtual bool 
valid() const override + inline bool valid() const override { if (!m_buffer->valid()) return false; if (m_offset >= m_buffer->getSize()) return false; @@ -61,7 +61,7 @@ class ICPUBufferView : public IBufferView, public IAsset private: - inline virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { if (!visit(m_buffer.get())) return; } diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 02b56d02ce..9b867e3a06 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -88,7 +88,7 @@ class ICPUComputePipeline final : public ICPUPipeline visit) const override + inline void visitDependents_impl(std::function visit) const override { if (!visit(m_layout.get())) return; if (!visit(m_specInfo.shader.get())) return; diff --git a/include/nbl/asset/ICPUDescriptorSet.h b/include/nbl/asset/ICPUDescriptorSet.h index 776e4e1409..29cfe4cb1d 100644 --- a/include/nbl/asset/ICPUDescriptorSet.h +++ b/include/nbl/asset/ICPUDescriptorSet.h @@ -77,7 +77,7 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSet clone(uint32_t _depth = ~0u) const override; - inline virtual bool valid() const override { + inline bool valid() const override { if (!m_layout->valid()) return false; return true; } @@ -90,7 +90,7 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSet m_descriptorInfos[static_cast(IDescriptor::E_TYPE::ET_COUNT)]; - virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { for (auto i = 0u; i < static_cast(IDescriptor::E_TYPE::ET_COUNT); i++) { diff --git a/include/nbl/asset/ICPUDescriptorSetLayout.h b/include/nbl/asset/ICPUDescriptorSetLayout.h index da249620bc..a46bb55808 100644 --- a/include/nbl/asset/ICPUDescriptorSetLayout.h +++ b/include/nbl/asset/ICPUDescriptorSetLayout.h @@ -56,7 +56,7 @@ 
class ICPUDescriptorSetLayout : public IDescriptorSetLayout, public constexpr static inline auto AssetType = ET_DESCRIPTOR_SET_LAYOUT; inline E_TYPE getAssetType() const override { return AssetType; } - inline virtual bool valid() const override + inline bool valid() const override { return true; // no modification is possible after creation } @@ -67,7 +67,7 @@ class ICPUDescriptorSetLayout : public IDescriptorSetLayout, public private: - inline virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { if (!m_immutableSamplers) return; for (const auto& sampler : *m_immutableSamplers) diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index f39f38f673..a95a82633c 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -40,7 +40,7 @@ class ICPUGraphicsPipeline final : public ICPUPipeline getSpecInfos(hlsl::ShaderStage stage) const override final + inline std::span getSpecInfos(hlsl::ShaderStage stage) const override final { const auto stageIndex = stageToIndex(stage); if (stageIndex != -1) @@ -70,7 +70,7 @@ class ICPUGraphicsPipeline final : public ICPUPipelinevalid())return false; @@ -127,7 +127,7 @@ class ICPUGraphicsPipeline final : public ICPUPipeline(newPipeline, core::dont_grab); } - inline virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { if (!visit(m_layout.get())) return; if (!visit(m_renderpass.get())) return; diff --git a/include/nbl/asset/ICPUImage.h b/include/nbl/asset/ICPUImage.h index 01ee3d41e0..13cbb7ecec 100644 --- a/include/nbl/asset/ICPUImage.h +++ b/include/nbl/asset/ICPUImage.h @@ -195,7 +195,7 @@ class NBL_API2 ICPUImage final : public IImage, public IPreHashed return true; } - inline virtual bool valid() const override + inline bool valid() const override { if 
(!validateCreationParameters(m_creationParams)) return false; if (info != m_creationParams.format) return false; @@ -228,7 +228,7 @@ class NBL_API2 ICPUImage final : public IImage, public IPreHashed } }; - inline virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { } }; diff --git a/include/nbl/asset/ICPUImageView.h b/include/nbl/asset/ICPUImageView.h index 953651c604..85a0629cc3 100644 --- a/include/nbl/asset/ICPUImageView.h +++ b/include/nbl/asset/ICPUImageView.h @@ -62,7 +62,7 @@ class ICPUImageView final : public IImageView, public IAsset params.subresourceRange.aspectMask = aspect.value; } - inline virtual bool valid() const override + inline bool valid() const override { if (!validateCreationParameters(params)) return false; @@ -78,7 +78,7 @@ class ICPUImageView final : public IImageView, public IAsset private: - inline virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { if (!visit(params.image.get())) return; } diff --git a/include/nbl/asset/ICPUMesh.h b/include/nbl/asset/ICPUMesh.h index 0f780ef437..df647b14a4 100644 --- a/include/nbl/asset/ICPUMesh.h +++ b/include/nbl/asset/ICPUMesh.h @@ -81,7 +81,7 @@ class ICPUMesh final : public IMesh, public IAsset return cp; } - inline virtual bool valid() const override + inline bool valid() const override { for (const auto& meshBuffer : m_meshBuffers) { @@ -96,9 +96,9 @@ class ICPUMesh final : public IMesh, public IAsset private: core::vector> m_meshBuffers; - inline virtual void visitDependents_impl(std::function visit) const override - { - } + inline void visitDependents_impl(std::function visit) const override + { + } }; } diff --git a/include/nbl/asset/ICPUMeshBuffer.h b/include/nbl/asset/ICPUMeshBuffer.h index 6bd3cd5700..aa6cbc9429 100644 --- a/include/nbl/asset/ICPUMeshBuffer.h +++ b/include/nbl/asset/ICPUMeshBuffer.h @@ -610,13 
+610,13 @@ class ICPUMeshBuffer final : public IMeshBuffer(const_cast(this)->getJointAABBs()); } - inline virtual bool valid() const override + inline bool valid() const override { return true; } private: - inline virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { } }; diff --git a/include/nbl/asset/ICPUPipelineCache.h b/include/nbl/asset/ICPUPipelineCache.h index 702b86620e..c5511f39bb 100644 --- a/include/nbl/asset/ICPUPipelineCache.h +++ b/include/nbl/asset/ICPUPipelineCache.h @@ -83,7 +83,7 @@ class ICPUPipelineCache final : public IPreHashed // const auto& getEntries() const {return m_cache;} - inline virtual bool valid() const override + inline bool valid() const override { return true; } @@ -98,7 +98,7 @@ class ICPUPipelineCache final : public IPreHashed private: entries_map_t m_cache; - inline virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { } }; diff --git a/include/nbl/asset/ICPUPipelineLayout.h b/include/nbl/asset/ICPUPipelineLayout.h index 0684980cf8..b30ecc3e10 100644 --- a/include/nbl/asset/ICPUPipelineLayout.h +++ b/include/nbl/asset/ICPUPipelineLayout.h @@ -66,7 +66,7 @@ class ICPUPipelineLayout : public IAsset, public IPipelineLayout visit) const override + inline void visitDependents_impl(std::function visit) const override { for (auto i = 0; i < m_descSetLayouts.size(); i++) { diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h index 8e6bdaf8b9..2c157f91e9 100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -36,7 +36,7 @@ class ICPURayTracingPipeline final : public ICPUPipeline getSpecInfos(hlsl::ShaderStage stage) const override final + inline std::span getSpecInfos(hlsl::ShaderStage stage) const override final { switch (stage) { @@ -84,7 +84,7 @@ class 
ICPURayTracingPipeline final : public ICPUPipelinevalid()) return false; @@ -116,7 +116,7 @@ class ICPURayTracingPipeline final : public ICPUPipeline visit) const override + inline void visitDependents_impl(std::function visit) const override { if (!visit(m_raygen.shader.get()) return; for (const auto& missInfo : self->m_misses) if (!visit(missInfo.shader.get())) return; diff --git a/include/nbl/asset/ICPURenderpass.h b/include/nbl/asset/ICPURenderpass.h index a131b44add..daaa5c62b0 100644 --- a/include/nbl/asset/ICPURenderpass.h +++ b/include/nbl/asset/ICPURenderpass.h @@ -38,7 +38,7 @@ class ICPURenderpass : public IRenderpass, public IAsset return ET_RENDERPASS; } - inline virtual bool valid() const override + inline bool valid() const override { // no modification is possible after creation. parameter is validated when creating renderpass return true; @@ -50,7 +50,7 @@ class ICPURenderpass : public IRenderpass, public IAsset private: - inline virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { } diff --git a/include/nbl/asset/ICPURenderpassIndependentPipeline.h b/include/nbl/asset/ICPURenderpassIndependentPipeline.h index b349aab888..3d67af23d0 100644 --- a/include/nbl/asset/ICPURenderpassIndependentPipeline.h +++ b/include/nbl/asset/ICPURenderpassIndependentPipeline.h @@ -93,7 +93,7 @@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, m_layout = std::move(_layout); } - inline virtual bool valid() const override + inline bool valid() const override { return m_layout && m_layout->valid(); } @@ -155,7 +155,7 @@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, private: - inline virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { } }; diff --git a/include/nbl/asset/ICPUSampler.h b/include/nbl/asset/ICPUSampler.h index 
8db568f26b..6b2bea5219 100644 --- a/include/nbl/asset/ICPUSampler.h +++ b/include/nbl/asset/ICPUSampler.h @@ -68,11 +68,11 @@ class ICPUSampler : public ISampler, public IAsset constexpr static inline auto AssetType = ET_SAMPLER; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } - inline virtual bool valid() const override { return true; } + inline bool valid() const override { return true; } private: - inline virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { } }; diff --git a/include/nbl/asset/ICPUSkeleton.h b/include/nbl/asset/ICPUSkeleton.h index 361468d1c5..1049798268 100644 --- a/include/nbl/asset/ICPUSkeleton.h +++ b/include/nbl/asset/ICPUSkeleton.h @@ -78,11 +78,11 @@ class ICPUSkeleton final : public ISkeleton, public IAsset constexpr static inline auto AssetType = ET_SKELETON; inline E_TYPE getAssetType() const override { return AssetType; } - inline virtual bool valid() const override { return true; } + inline bool valid() const override { return true; } private: - inline virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { if (!visit(m_defaultTransforms.buffer.get())) return; if (!visit(m_parentJointIDs.buffer.get())) return; From 469bf0419ccf56cf225fae1ae03d290794ff1f18 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Jun 2025 16:34:39 +0700 Subject: [PATCH 322/346] Fix indentation --- include/nbl/asset/ICPUAccelerationStructure.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index a4f1e9dec4..a6b148a891 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -275,7 +275,7 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo 
core::smart_refctd_dynamic_array m_geometryPrimitiveCount = nullptr; core::bitflag m_buildFlags = BUILD_FLAGS::PREFER_FAST_TRACE_BIT; - inline void visitDependents_impl(std::function visit) const override {} + inline void visitDependents_impl(std::function visit) const override {} }; class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelAccelerationStructure @@ -399,11 +399,11 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA hlsl::acceleration_structures::top_level::BuildRangeInfo m_buildRangeInfo; core::bitflag m_buildFlags = BUILD_FLAGS::PREFER_FAST_BUILD_BIT; - inline void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { - if (!m_instances) return; - for (const auto& instance : *m_instances) - if (!visit(instance.getBase().blas.get())) return; + if (!m_instances) return; + for (const auto& instance : *m_instances) + if (!visit(instance.getBase().blas.get())) return; } }; From 6e23e6e76c4f29dae8d02584f82423cbb3c3cdcf Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Jun 2025 17:03:27 +0700 Subject: [PATCH 323/346] Fix indentation --- include/nbl/asset/ICPUImage.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/include/nbl/asset/ICPUImage.h b/include/nbl/asset/ICPUImage.h index 13cbb7ecec..fdbf640557 100644 --- a/include/nbl/asset/ICPUImage.h +++ b/include/nbl/asset/ICPUImage.h @@ -195,16 +195,16 @@ class NBL_API2 ICPUImage final : public IImage, public IPreHashed return true; } - inline bool valid() const override - { - if (!validateCreationParameters(m_creationParams)) return false; - if (info != m_creationParams.format) return false; - if (buffer && !buffer->valid()) return false; - if (regions) - for (const auto& region : *regions) - if (!region.isValid()) return false; - return true; - } + inline bool valid() const override + { + if 
(!validateCreationParameters(m_creationParams)) return false; + if (info != m_creationParams.format) return false; + if (buffer && !buffer->valid()) return false; + if (regions) + for (const auto& region : *regions) + if (!region.isValid()) return false; + return true; + } protected: inline ICPUImage(const SCreationParams& _params) : IImage(_params) {} From 9c138b7c281ed6610058bf80bbd27035730518f4 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Jun 2025 17:03:35 +0700 Subject: [PATCH 324/346] Fix indentation --- include/nbl/asset/IPreHashed.h | 104 ++++++++++++++++----------------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/include/nbl/asset/IPreHashed.h b/include/nbl/asset/IPreHashed.h index 054bfaee92..f7252211e1 100644 --- a/include/nbl/asset/IPreHashed.h +++ b/include/nbl/asset/IPreHashed.h @@ -39,61 +39,61 @@ class IPreHashed : public IAsset discardContent_impl(); } - static inline void discardDependantsContents(const std::span roots) - { - core::stack stack; - core::unordered_set alreadyVisited; // whether we have push the node to the stack - auto push = [&stack,&alreadyVisited](IAsset* node) -> bool - { - const auto [dummy,inserted] = alreadyVisited.insert(node); - if (inserted) - stack.push(node); - return true; - }; - for (const auto& root : roots) - push(root); - while (!stack.empty()) - { - auto* entry = stack.top(); - stack.pop(); - entry->visitDependents(push); - // post order traversal does discard + static inline void discardDependantsContents(const std::span roots) + { + core::vector stack; + core::unordered_set alreadyVisited; // whether we have push the node to the stack + auto push = [&stack,&alreadyVisited](IAsset* node) -> bool + { + const auto [dummy,inserted] = alreadyVisited.insert(node); + if (inserted) + stack.push_back(node); + return true; + }; + for (const auto& root : roots) + push(root); + while (!stack.empty()) + { + auto* entry = stack.back(); + stack.pop_back(); + entry->visitDependents(push); + // pre order 
traversal does discard auto* isPrehashed = dynamic_cast(entry); if (isPrehashed) isPrehashed->discardContent(); - } - } - static inline bool anyDependantDiscardedContents(const IAsset* root) - { - core::stack stack; - core::unordered_set alreadyVisited; // whether we have push the node to the stack - bool result = false; - auto push = [&stack,&alreadyVisited,&result](const IAsset* node) -> bool - { - const auto [dummy,inserted] = alreadyVisited.insert(node); - if (inserted) - { - auto* isPrehashed = dynamic_cast(node); - if (isPrehashed && isPrehashed->missingContent()) - { - stack = {}; - result = true; - return false; - } - stack.push(node); - } - return true; - }; - if (!push(root)) - return true; - while (!stack.empty()) - { - auto* entry = stack.top(); - stack.pop(); - entry->visitDependents(push); - } - return result; - } + } + } + static inline bool anyDependantDiscardedContents(const IAsset* root) + { + core::vector stack; + core::unordered_set alreadyVisited; // whether we have push the node to the stack + bool result = false; + auto push = [&stack,&alreadyVisited,&result](const IAsset* node) -> bool + { + const auto [dummy,inserted] = alreadyVisited.insert(node); + if (inserted) + { + auto* isPrehashed = dynamic_cast(node); + if (isPrehashed && isPrehashed->missingContent()) + { + stack.clear(); + result = true; + return false; + } + stack.push_back(node); + } + return true; + }; + if (!push(root)) + return true; + while (!stack.empty()) + { + auto* entry = stack.back(); + stack.pop_back(); + entry->visitDependents(push); + } + return result; + } protected: inline IPreHashed() = default; From 026d49412acb9e77d2dcdb6b911df5f36d9db63b Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Jun 2025 17:05:27 +0700 Subject: [PATCH 325/346] Fix indentation --- include/nbl/asset/IAsset.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/nbl/asset/IAsset.h b/include/nbl/asset/IAsset.h index b35981ffc7..aae73fac2a 100644 --- 
a/include/nbl/asset/IAsset.h +++ b/include/nbl/asset/IAsset.h @@ -156,15 +156,15 @@ class IAsset : virtual public core::IReferenceCounted //! inline bool isMutable() const {return m_mutable;} - inline void visitDependents(std::function visit) const - { - visitDependents_impl([&visit](const IAsset* dep)->bool + inline void visitDependents(std::function visit) const + { + visitDependents_impl([&visit](const IAsset* dep)->bool { if (dep) return visit(dep); return true; }); - } + } inline void visitDependents(std::function visit) { From 2578abed02a426253e9cded093da1c84397eb020 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Jun 2025 17:57:16 +0700 Subject: [PATCH 326/346] Check raygen shader existence for raytracing pipeline --- include/nbl/video/IGPURayTracingPipeline.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/nbl/video/IGPURayTracingPipeline.h b/include/nbl/video/IGPURayTracingPipeline.h index 482861dbcc..56c7b38c29 100644 --- a/include/nbl/video/IGPURayTracingPipeline.h +++ b/include/nbl/video/IGPURayTracingPipeline.h @@ -141,6 +141,8 @@ class IGPURayTracingPipeline : public IGPUPipeline Date: Mon, 16 Jun 2025 17:57:32 +0700 Subject: [PATCH 327/346] Check vertex shader existence for graphics pipeline --- include/nbl/video/IGPUGraphicsPipeline.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/nbl/video/IGPUGraphicsPipeline.h b/include/nbl/video/IGPUGraphicsPipeline.h index 6b2201672b..7027252b0f 100644 --- a/include/nbl/video/IGPUGraphicsPipeline.h +++ b/include/nbl/video/IGPUGraphicsPipeline.h @@ -64,6 +64,9 @@ class IGPUGraphicsPipeline : public IGPUPipeline Date: Mon, 16 Jun 2025 17:58:34 +0700 Subject: [PATCH 328/346] Remove comment on IShader::valid() --- include/nbl/asset/IShader.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/nbl/asset/IShader.h b/include/nbl/asset/IShader.h index d1ae2e0c86..25211f5909 100644 --- a/include/nbl/asset/IShader.h +++ b/include/nbl/asset/IShader.h @@ -91,7 +91,6 @@ class IShader : 
public IAsset { if (!m_code) return false; if (m_contentType == E_CONTENT_TYPE::ECT_UNKNOWN) return false; - // Note(kevyuu) : Should we check for m_filepathHint if content type is not spirv. What if no pragma includ in the source code. Do we even need m_filepathHint in that case? return true; } From e2f7b8f59e5010ac6eb2354a4ca9e5cfa0af6eea Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Jun 2025 17:59:39 +0700 Subject: [PATCH 329/346] Remove virtual on IShader::valid and IShader::visitDependents_impl --- include/nbl/asset/IShader.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/asset/IShader.h b/include/nbl/asset/IShader.h index 25211f5909..34b93e99c2 100644 --- a/include/nbl/asset/IShader.h +++ b/include/nbl/asset/IShader.h @@ -87,7 +87,7 @@ class IShader : public IAsset // TODO: `void setContent(core::smart_refctd_ptr&&,const E_CONTENT_TYPE)` - inline virtual bool valid() const override + inline bool valid() const override { if (!m_code) return false; if (m_contentType == E_CONTENT_TYPE::ECT_UNKNOWN) return false; @@ -106,7 +106,7 @@ class IShader : public IAsset private: - inline virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { if (!visit(m_code.get())) return; } From 47900b1bfca77a3dcf238a913432a44b193b4e0a Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Jun 2025 18:00:38 +0700 Subject: [PATCH 330/346] Add final to IShader --- include/nbl/asset/IShader.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/asset/IShader.h b/include/nbl/asset/IShader.h index 34b93e99c2..96ff73f3f0 100644 --- a/include/nbl/asset/IShader.h +++ b/include/nbl/asset/IShader.h @@ -27,7 +27,7 @@ namespace nbl::asset The purpose for the class is for storing raw HLSL code to be compiled or already compiled (but unspecialized) SPIR-V code. 
*/ -class IShader : public IAsset +class IShader final : public IAsset { public: enum class E_CONTENT_TYPE : uint8_t From 029cfeb5e7f9eae3caebd572c26c47b04d7806c4 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 16 Jun 2025 18:10:02 +0700 Subject: [PATCH 331/346] improved readability for config, include all new files --- .../hlsl/workgroup2/arithmetic_config.hlsl | 118 ++++++++---------- .../impl/arithmetic_config_def.hlsl | 38 +++--- .../workgroup2/impl/items_per_invoc_def.hlsl | 6 +- .../workgroup2/impl/virtual_wg_size_def.hlsl | 4 +- src/nbl/builtin/CMakeLists.txt | 3 + 5 files changed, 78 insertions(+), 91 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 6eb6a535fe..9a211899cb 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -21,13 +21,11 @@ template struct virtual_wg_size_log2 { #define DEFINE_ASSIGN(TYPE,ID,...) NBL_CONSTEXPR_STATIC_INLINE TYPE ID = __VA_ARGS__; - #define DEFINE_VIRTUAL_WG_T(ID) ID - #define DEFINE_MPL_MAX_V(TYPE,ARG1,ARG2) mpl::max_v - #define DEFINE_COND_VAL(TYPE,COND,TRUE_VAL,FALSE_VAL) conditional_value::value + #define MAX(TYPE,ARG1,ARG2) mpl::max_v + #define SELECT(TYPE,COND,TRUE_VAL,FALSE_VAL) conditional_value::value #include "impl/virtual_wg_size_def.hlsl" - #undef DEFINE_COND_VAL - #undef DEFINE_MPL_MAX_V - #undef DEFINE_VIRTUAL_WG_T + #undef SELECT + #undef MAX #undef DEFINE_ASSIGN // must have at least enough level 0 outputs to feed a single subgroup @@ -39,17 +37,15 @@ template struct items_per_invocation { #define DEFINE_ASSIGN(TYPE,ID,...) 
NBL_CONSTEXPR_STATIC_INLINE TYPE ID = __VA_ARGS__; - #define DEFINE_VIRTUAL_WG_T(ID) VirtualWorkgroup::ID - #define DEFINE_ITEMS_INVOC_T(ID) ID - #define DEFINE_MPL_MIN_V(TYPE,ARG1,ARG2) mpl::min_v - #define DEFINE_MPL_MAX_V(TYPE,ARG1,ARG2) mpl::max_v - #define DEFINE_COND_VAL(TYPE,COND,TRUE_VAL,FALSE_VAL) conditional_value::value + #define VIRTUAL_WG_SIZE VirtualWorkgroup:: + #define MIN(TYPE,ARG1,ARG2) mpl::min_v + #define MAX(TYPE,ARG1,ARG2) mpl::max_v + #define SELECT(TYPE,COND,TRUE_VAL,FALSE_VAL) conditional_value::value #include "impl/items_per_invoc_def.hlsl" - #undef DEFINE_COND_VAL - #undef DEFINE_MPL_MAX_V - #undef DEFINE_MPL_MIN_V - #undef DEFINE_ITEMS_INVOC_T - #undef DEFINE_VIRTUAL_WG_T + #undef SELECT + #undef MAX + #undef MIN + #undef VIRTUAL_WG_SIZE #undef DEFINE_ASSIGN using ItemsPerInvocation = tuple,integral_constant,integral_constant >; @@ -64,17 +60,15 @@ struct ArithmeticConfiguration using ItemsPerInvocation = typename items_per_invoc_t::ItemsPerInvocation; #define DEFINE_ASSIGN(TYPE,ID,...) 
NBL_CONSTEXPR_STATIC_INLINE TYPE ID = __VA_ARGS__; - #define DEFINE_VIRTUAL_WG_T(ID) virtual_wg_t::ID - #define DEFINE_ITEMS_INVOC_T(ID) items_per_invoc_t::ID - #define DEFINE_CONFIG_T(ID) ID - #define DEFINE_MPL_MAX_V(TYPE,ARG1,ARG2) mpl::max_v - #define DEFINE_COND_VAL(TYPE,COND,TRUE_VAL,FALSE_VAL) conditional_value::value + #define VIRTUAL_WG_SIZE virtual_wg_t:: + #define ITEMS_PER_INVOC items_per_invoc_t:: + #define MAX(TYPE,ARG1,ARG2) mpl::max_v + #define SELECT(TYPE,COND,TRUE_VAL,FALSE_VAL) conditional_value::value #include "impl/arithmetic_config_def.hlsl" - #undef DEFINE_COND_VAL - #undef DEFINE_MPL_MAX_V - #undef DEFINE_CONFIG_T - #undef DEFINE_ITEMS_INVOC_T - #undef DEFINE_VIRTUAL_WG_T + #undef SELECT + #undef MAX + #undef ITEMS_PER_INVOC + #undef VIRTUAL_WG_SIZE #undef DEFINE_ASSIGN using ChannelStride = tuple,integral_constant,integral_constant >; // we don't use stride 0 @@ -148,19 +142,15 @@ namespace impl { struct SVirtualWGSizeLog2 { - static SVirtualWGSizeLog2 create(const uint16_t _WorkgroupSizeLog2, const uint16_t _SubgroupSizeLog2) + void init(const uint16_t _WorkgroupSizeLog2, const uint16_t _SubgroupSizeLog2) { - SVirtualWGSizeLog2 retval; - #define DEFINE_ASSIGN(TYPE,ID,...) retval.ID = __VA_ARGS__; - #define DEFINE_VIRTUAL_WG_T(ID) retval.ID - #define DEFINE_MPL_MAX_V(TYPE,ARG1,ARG2) hlsl::max(ARG1, ARG2) - #define DEFINE_COND_VAL(TYPE,COND,TRUE_VAL,FALSE_VAL) (COND ? TRUE_VAL : FALSE_VAL) + #define DEFINE_ASSIGN(TYPE,ID,...) ID = __VA_ARGS__; + #define MAX(TYPE,ARG1,ARG2) hlsl::max(ARG1, ARG2) + #define SELECT(TYPE,COND,TRUE_VAL,FALSE_VAL) (COND ? TRUE_VAL : FALSE_VAL) #include "impl/virtual_wg_size_def.hlsl" - #undef DEFINE_COND_VAL - #undef DEFINE_MPL_MAX_V - #undef DEFINE_VIRTUAL_WG_T + #undef SELECT + #undef MAX #undef DEFINE_ASSIGN - return retval; } #define DEFINE_ASSIGN(TYPE,ID,...) 
TYPE ID; @@ -170,23 +160,19 @@ struct SVirtualWGSizeLog2 struct SItemsPerInvoc { - static SItemsPerInvoc create(const SVirtualWGSizeLog2 virtualWgSizeLog2, const uint16_t BaseItemsPerInvocation) + void init(const SVirtualWGSizeLog2 virtualWgSizeLog2, const uint16_t BaseItemsPerInvocation) { - SItemsPerInvoc retval; - #define DEFINE_ASSIGN(TYPE,ID,...) retval.ID = __VA_ARGS__; - #define DEFINE_VIRTUAL_WG_T(ID) virtualWgSizeLog2.ID - #define DEFINE_ITEMS_INVOC_T(ID) retval.ID - #define DEFINE_MPL_MIN_V(TYPE,ARG1,ARG2) hlsl::min(ARG1, ARG2) - #define DEFINE_MPL_MAX_V(TYPE,ARG1,ARG2) hlsl::max(ARG1, ARG2) - #define DEFINE_COND_VAL(TYPE,COND,TRUE_VAL,FALSE_VAL) (COND ? TRUE_VAL : FALSE_VAL) + #define DEFINE_ASSIGN(TYPE,ID,...) ID = __VA_ARGS__; + #define VIRTUAL_WG_SIZE virtualWgSizeLog2. + #define MIN(TYPE,ARG1,ARG2) hlsl::min(ARG1, ARG2) + #define MAX(TYPE,ARG1,ARG2) hlsl::max(ARG1, ARG2) + #define SELECT(TYPE,COND,TRUE_VAL,FALSE_VAL) (COND ? TRUE_VAL : FALSE_VAL) #include "impl/items_per_invoc_def.hlsl" - #undef DEFINE_COND_VAL - #undef DEFINE_MPL_MAX_V - #undef DEFINE_MPL_MIN_V - #undef DEFINE_ITEMS_INVOC_T - #undef DEFINE_VIRTUAL_WG_T + #undef SELECT + #undef MAX + #undef MIN + #undef VIRTUAL_WG_SIZE #undef DEFINE_ASSIGN - return retval; } #define DEFINE_ASSIGN(TYPE,ID,...) TYPE ID; @@ -197,26 +183,24 @@ struct SItemsPerInvoc struct SArithmeticConfiguration { - static SArithmeticConfiguration create(const uint16_t _WorkgroupSizeLog2, const uint16_t _SubgroupSizeLog2, const uint16_t _ItemsPerInvocation) + void init(const uint16_t _WorkgroupSizeLog2, const uint16_t _SubgroupSizeLog2, const uint16_t _ItemsPerInvocation) { - impl::SVirtualWGSizeLog2 virtualWgSizeLog2 = impl::SVirtualWGSizeLog2::create(_WorkgroupSizeLog2, _SubgroupSizeLog2); - impl::SItemsPerInvoc itemsPerInvoc = impl::SItemsPerInvoc::create(virtualWgSizeLog2, _ItemsPerInvocation); - - SArithmeticConfiguration retval; - #define DEFINE_ASSIGN(TYPE,ID,...) 
retval.ID = __VA_ARGS__; - #define DEFINE_VIRTUAL_WG_T(ID) virtualWgSizeLog2.ID - #define DEFINE_ITEMS_INVOC_T(ID) itemsPerInvoc.ID - #define DEFINE_CONFIG_T(ID) retval.ID - #define DEFINE_MPL_MAX_V(TYPE,ARG1,ARG2) hlsl::max(ARG1, ARG2) - #define DEFINE_COND_VAL(TYPE,COND,TRUE_VAL,FALSE_VAL) (COND ? TRUE_VAL : FALSE_VAL) + impl::SVirtualWGSizeLog2 virtualWgSizeLog2; + virtualWgSizeLog2.init(_WorkgroupSizeLog2, _SubgroupSizeLog2); + impl::SItemsPerInvoc itemsPerInvoc; + itemsPerInvoc.init(virtualWgSizeLog2, _ItemsPerInvocation); + + #define DEFINE_ASSIGN(TYPE,ID,...) ID = __VA_ARGS__; + #define VIRTUAL_WG_SIZE virtualWgSizeLog2. + #define ITEMS_PER_INVOC itemsPerInvoc. + #define MAX(TYPE,ARG1,ARG2) hlsl::max(ARG1, ARG2) + #define SELECT(TYPE,COND,TRUE_VAL,FALSE_VAL) (COND ? TRUE_VAL : FALSE_VAL) #include "impl/arithmetic_config_def.hlsl" - #undef DEFINE_COND_VAL - #undef DEFINE_MPL_MAX_V - #undef DEFINE_CONFIG_T - #undef DEFINE_ITEMS_INVOC_T - #undef DEFINE_VIRTUAL_WG_T + #undef SELECT + #undef MAX + #undef ITEMS_PER_INVOC + #undef VIRTUAL_WG_SIZE #undef DEFINE_ASSIGN - return retval; } #define DEFINE_ASSIGN(TYPE,ID,...) 
TYPE ID; diff --git a/include/nbl/builtin/hlsl/workgroup2/impl/arithmetic_config_def.hlsl b/include/nbl/builtin/hlsl/workgroup2/impl/arithmetic_config_def.hlsl index 4ea6fc010d..94f54409db 100644 --- a/include/nbl/builtin/hlsl/workgroup2/impl/arithmetic_config_def.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/impl/arithmetic_config_def.hlsl @@ -3,32 +3,32 @@ // For conditions of distribution and use, see copyright notice in nabla.h DEFINE_ASSIGN(uint16_t, WorkgroupSizeLog2, _WorkgroupSizeLog2) -DEFINE_ASSIGN(uint16_t, WorkgroupSize, uint16_t(0x1u) << DEFINE_CONFIG_T(WorkgroupSizeLog2)) +DEFINE_ASSIGN(uint16_t, WorkgroupSize, uint16_t(0x1u) << WorkgroupSizeLog2) DEFINE_ASSIGN(uint16_t, SubgroupSizeLog2, _SubgroupSizeLog2) -DEFINE_ASSIGN(uint16_t, SubgroupSize, uint16_t(0x1u) << DEFINE_CONFIG_T(SubgroupSizeLog2)) +DEFINE_ASSIGN(uint16_t, SubgroupSize, uint16_t(0x1u) << SubgroupSizeLog2) -DEFINE_ASSIGN(uint16_t, LevelCount, DEFINE_VIRTUAL_WG_T(levels)) -DEFINE_ASSIGN(uint16_t, VirtualWorkgroupSize, uint16_t(0x1u) << DEFINE_VIRTUAL_WG_T(value)) +DEFINE_ASSIGN(uint16_t, LevelCount, VIRTUAL_WG_SIZE levels) +DEFINE_ASSIGN(uint16_t, VirtualWorkgroupSize, uint16_t(0x1u) << VIRTUAL_WG_SIZE value) -DEFINE_ASSIGN(uint16_t, ItemsPerInvocation_0, DEFINE_ITEMS_INVOC_T(value0)) -DEFINE_ASSIGN(uint16_t, ItemsPerInvocation_1, DEFINE_ITEMS_INVOC_T(value1)) -DEFINE_ASSIGN(uint16_t, ItemsPerInvocation_2, DEFINE_ITEMS_INVOC_T(value2)) +DEFINE_ASSIGN(uint16_t, ItemsPerInvocation_0, ITEMS_PER_INVOC value0) +DEFINE_ASSIGN(uint16_t, ItemsPerInvocation_1, ITEMS_PER_INVOC value1) +DEFINE_ASSIGN(uint16_t, ItemsPerInvocation_2, ITEMS_PER_INVOC value2) -DEFINE_ASSIGN(uint16_t, LevelInputCount_1, DEFINE_COND_VAL(uint16_t,(DEFINE_CONFIG_T(LevelCount)==3), - DEFINE_MPL_MAX_V(uint16_t, (DEFINE_CONFIG_T(VirtualWorkgroupSize)>>DEFINE_CONFIG_T(SubgroupSizeLog2)), DEFINE_CONFIG_T(SubgroupSize)), - DEFINE_CONFIG_T(SubgroupSize)*DEFINE_CONFIG_T(ItemsPerInvocation_1))) -DEFINE_ASSIGN(uint16_t, 
LevelInputCount_2, DEFINE_COND_VAL(uint16_t,(DEFINE_CONFIG_T(LevelCount)==3),DEFINE_CONFIG_T(SubgroupSize)*DEFINE_CONFIG_T(ItemsPerInvocation_2),0)) -DEFINE_ASSIGN(uint16_t, VirtualInvocationsAtLevel1, DEFINE_CONFIG_T(LevelInputCount_1) / DEFINE_CONFIG_T(ItemsPerInvocation_1)) +DEFINE_ASSIGN(uint16_t, LevelInputCount_1, SELECT(uint16_t,(LevelCount==3), + MAX(uint16_t, (VirtualWorkgroupSize>>SubgroupSizeLog2), SubgroupSize), + SubgroupSize*ItemsPerInvocation_1)) +DEFINE_ASSIGN(uint16_t, LevelInputCount_2, SELECT(uint16_t,(LevelCount==3),SubgroupSize*ItemsPerInvocation_2,0)) +DEFINE_ASSIGN(uint16_t, VirtualInvocationsAtLevel1, LevelInputCount_1 / ItemsPerInvocation_1) -DEFINE_ASSIGN(uint16_t, __padding, DEFINE_COND_VAL(uint16_t,(DEFINE_CONFIG_T(LevelCount)==3),DEFINE_CONFIG_T(SubgroupSize)-1,0)) -DEFINE_ASSIGN(uint16_t, __channelStride_1, DEFINE_COND_VAL(uint16_t,(DEFINE_CONFIG_T(LevelCount)==3),DEFINE_CONFIG_T(VirtualInvocationsAtLevel1),DEFINE_CONFIG_T(SubgroupSize)) + DEFINE_CONFIG_T(__padding)) -DEFINE_ASSIGN(uint16_t, __channelStride_2, DEFINE_COND_VAL(uint16_t,(DEFINE_CONFIG_T(LevelCount)==3),DEFINE_CONFIG_T(SubgroupSize),0)) +DEFINE_ASSIGN(uint16_t, __padding, SELECT(uint16_t,(LevelCount==3),SubgroupSize-1,0)) +DEFINE_ASSIGN(uint16_t, __channelStride_1, SELECT(uint16_t,(LevelCount==3),VirtualInvocationsAtLevel1,SubgroupSize) + __padding) +DEFINE_ASSIGN(uint16_t, __channelStride_2, SELECT(uint16_t,(LevelCount==3),SubgroupSize,0)) // user specified the shared mem size of Scalars -DEFINE_ASSIGN(uint32_t, SharedScratchElementCount, DEFINE_COND_VAL(uint16_t,(DEFINE_CONFIG_T(LevelCount)==1), +DEFINE_ASSIGN(uint32_t, SharedScratchElementCount, SELECT(uint16_t,(LevelCount==1), 0, - DEFINE_COND_VAL(uint16_t,(DEFINE_CONFIG_T(LevelCount)==3), - DEFINE_CONFIG_T(LevelInputCount_2)+(DEFINE_CONFIG_T(SubgroupSize)*DEFINE_CONFIG_T(ItemsPerInvocation_1))-1, + SELECT(uint16_t,(LevelCount==3), + LevelInputCount_2+(SubgroupSize*ItemsPerInvocation_1)-1, 0 - ) + 
DEFINE_CONFIG_T(LevelInputCount_1) + ) + LevelInputCount_1 )) diff --git a/include/nbl/builtin/hlsl/workgroup2/impl/items_per_invoc_def.hlsl b/include/nbl/builtin/hlsl/workgroup2/impl/items_per_invoc_def.hlsl index 857b64d774..c32d7ef8bd 100644 --- a/include/nbl/builtin/hlsl/workgroup2/impl/items_per_invoc_def.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/impl/items_per_invoc_def.hlsl @@ -2,7 +2,7 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -DEFINE_ASSIGN(uint16_t, ItemsPerInvocationProductLog2, DEFINE_MPL_MAX_V(int16_t,DEFINE_VIRTUAL_WG_T(WorkgroupSizeLog2)-DEFINE_VIRTUAL_WG_T(SubgroupSizeLog2)*DEFINE_VIRTUAL_WG_T(levels),0)) +DEFINE_ASSIGN(uint16_t, ItemsPerInvocationProductLog2, MAX(int16_t,VIRTUAL_WG_SIZE WorkgroupSizeLog2-VIRTUAL_WG_SIZE SubgroupSizeLog2*VIRTUAL_WG_SIZE levels,0)) DEFINE_ASSIGN(uint16_t, value0, BaseItemsPerInvocation) -DEFINE_ASSIGN(uint16_t, value1, uint16_t(0x1u) << DEFINE_COND_VAL(uint16_t,(DEFINE_VIRTUAL_WG_T(levels)==3),DEFINE_MPL_MIN_V(uint16_t,DEFINE_ITEMS_INVOC_T(ItemsPerInvocationProductLog2),2),DEFINE_ITEMS_INVOC_T(ItemsPerInvocationProductLog2))) -DEFINE_ASSIGN(uint16_t, value2, uint16_t(0x1u) << DEFINE_MPL_MAX_V(int16_t,DEFINE_ITEMS_INVOC_T(ItemsPerInvocationProductLog2)-2,0)) \ No newline at end of file +DEFINE_ASSIGN(uint16_t, value1, uint16_t(0x1u) << SELECT(uint16_t,(VIRTUAL_WG_SIZE levels==3),MIN(uint16_t,ItemsPerInvocationProductLog2,2),ItemsPerInvocationProductLog2)) +DEFINE_ASSIGN(uint16_t, value2, uint16_t(0x1u) << MAX(int16_t,ItemsPerInvocationProductLog2-2,0)) \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/workgroup2/impl/virtual_wg_size_def.hlsl b/include/nbl/builtin/hlsl/workgroup2/impl/virtual_wg_size_def.hlsl index 3190ba5df3..e4c4047f1d 100644 --- a/include/nbl/builtin/hlsl/workgroup2/impl/virtual_wg_size_def.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/impl/virtual_wg_size_def.hlsl @@ -4,5 +4,5 @@ 
DEFINE_ASSIGN(uint16_t, WorkgroupSizeLog2, _WorkgroupSizeLog2) DEFINE_ASSIGN(uint16_t, SubgroupSizeLog2, _SubgroupSizeLog2) -DEFINE_ASSIGN(uint16_t, levels, DEFINE_COND_VAL(uint16_t,(_WorkgroupSizeLog2>_SubgroupSizeLog2),DEFINE_COND_VAL(uint16_t,(_WorkgroupSizeLog2>_SubgroupSizeLog2*2+2),3,2),1)) -DEFINE_ASSIGN(uint16_t, value, DEFINE_MPL_MAX_V(uint16_t, _SubgroupSizeLog2*DEFINE_VIRTUAL_WG_T(levels), _WorkgroupSizeLog2)) +DEFINE_ASSIGN(uint16_t, levels, SELECT(uint16_t,(_WorkgroupSizeLog2>_SubgroupSizeLog2),SELECT(uint16_t,(_WorkgroupSizeLog2>_SubgroupSizeLog2*2+2),3,2),1)) +DEFINE_ASSIGN(uint16_t, value, MAX(uint16_t, _SubgroupSizeLog2*levels, _WorkgroupSizeLog2)) diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index d051c2153b..a3d15744a7 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -347,6 +347,9 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/shared_scan.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/shuffle.hlsl") #workgroup2 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/arithmetic_config.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/impl/virtual_wg_size_def.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/impl/items_per_invoc_def.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/impl/arithmetic_config_def.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/arithmetic.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/shared_scan.hlsl") #Extensions From 9b340a4df6627b3abd3950312c8629d9c1782fb8 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 16 Jun 2025 13:29:25 +0200 Subject: [PATCH 332/346] set the `examples_tests` submodule back to `master` HEAD as workgroup scan example not ready yet --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 4c10dc1cdb..e30938c261 160000 --- 
a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 4c10dc1cdba4ab12dfedef97768aa4a10e606213 +Subproject commit e30938c2615dd5d3ab69cadca3ba11d1e03f8233 From 5d990a3698ee69e57aad41376e2c445f18197816 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Jun 2025 11:09:14 +0700 Subject: [PATCH 333/346] Rename ISPIRVDebloater to ISPIRVEntryPointTrimmer --- ...VDebloater.h => ISPIRVEntryPointTrimmer.h} | 18 +-- include/nbl/video/ILogicalDevice.h | 4 +- src/nbl/CMakeLists.txt | 2 +- ...loater.cpp => ISPIRVEntryPointTrimmer.cpp} | 26 ++-- src/nbl/video/CVulkanLogicalDevice.cpp | 2 +- src/nbl/video/ILogicalDevice.cpp | 124 +++++++++--------- 6 files changed, 88 insertions(+), 88 deletions(-) rename include/nbl/asset/utils/{ISPIRVDebloater.h => ISPIRVEntryPointTrimmer.h} (72%) rename src/nbl/asset/utils/{ISPIRVDebloater.cpp => ISPIRVEntryPointTrimmer.cpp} (91%) diff --git a/include/nbl/asset/utils/ISPIRVDebloater.h b/include/nbl/asset/utils/ISPIRVEntryPointTrimmer.h similarity index 72% rename from include/nbl/asset/utils/ISPIRVDebloater.h rename to include/nbl/asset/utils/ISPIRVEntryPointTrimmer.h index f5f87956be..a2e24dabab 100644 --- a/include/nbl/asset/utils/ISPIRVDebloater.h +++ b/include/nbl/asset/utils/ISPIRVEntryPointTrimmer.h @@ -1,5 +1,5 @@ -#ifndef _NBL_ASSET_I_SPIRV_DEBLOATER_H_INCLUDED_ -#define _NBL_ASSET_I_SPIRV_DEBLOATER_H_INCLUDED_ +#ifndef _NBL_ASSET_I_SPIRV_ENTRY_POINT_TRIMMER_H_INCLUDED_ +#define _NBL_ASSET_I_SPIRV_ENTRY_POINT_TRIMMER_H_INCLUDED_ #include "nbl/core/declarations.h" @@ -10,14 +10,14 @@ namespace nbl::asset { -class ISPIRVDebloater final : public core::IReferenceCounted +class ISPIRVEntryPointTrimmer final : public core::IReferenceCounted { public: - ISPIRVDebloater(); + ISPIRVEntryPointTrimmer(); struct Result { - core::smart_refctd_ptr spirv; // nullptr if there is some entry point not found or spirv does not need to be debloated + core::smart_refctd_ptr spirv; // nullptr if there is some entry point not found or spirv 
does not need to be trimmed bool isSuccess; inline operator bool() const @@ -45,9 +45,9 @@ class ISPIRVDebloater final : public core::IReferenceCounted } }; - Result debloat(const ICPUBuffer* spirvBuffer, const core::set& entryPoints, system::logger_opt_ptr logger = nullptr) const; + Result trim(const ICPUBuffer* spirvBuffer, const core::set& entryPoints, system::logger_opt_ptr logger = nullptr) const; - inline core::smart_refctd_ptr debloat(const IShader* shader, const core::set& entryPoints, system::logger_opt_ptr logger = nullptr) const + inline core::smart_refctd_ptr trim(const IShader* shader, const core::set& entryPoints, system::logger_opt_ptr logger = nullptr) const { if (shader->getContentType() != IShader::E_CONTENT_TYPE::ECT_SPIRV) { @@ -55,10 +55,10 @@ class ISPIRVDebloater final : public core::IReferenceCounted return nullptr; } const auto buffer = shader->getContent(); - const auto result = debloat(buffer, entryPoints, logger); + const auto result = trim(buffer, entryPoints, logger); if (result && result.spirv.get() == nullptr) { - // when debloat does not happen return original shader + // when trim does not happen return original shader return core::smart_refctd_ptr(shader); } diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index d8ef2bdef1..def3ee0979 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -3,7 +3,7 @@ #include "nbl/asset/asset.h" #include "nbl/asset/utils/ISPIRVOptimizer.h" -#include "nbl/asset/utils/ISPIRVDebloater.h" +#include "nbl/asset/utils/ISPIRVEntryPointTrimmer.h" #include "nbl/asset/utils/CCompilerSet.h" #include "nbl/video/SPhysicalDeviceFeatures.h" @@ -1315,7 +1315,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe uint16_t firstQueueIndex = 0u; }; const std::array m_queueFamilyInfos; - core::smart_refctd_ptr m_spirvDebloater; + core::smart_refctd_ptr m_spirvTrimmer; private: const SPhysicalDeviceLimits& 
getPhysicalDeviceLimits() const; diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index b484464fb3..2dddc74f77 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -162,7 +162,7 @@ set(NBL_ASSET_SOURCES # Shaders ${NBL_ROOT_PATH}/src/nbl/asset/utils/ISPIRVOptimizer.cpp - ${NBL_ROOT_PATH}/src/nbl/asset/utils/ISPIRVDebloater.cpp + ${NBL_ROOT_PATH}/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp ${NBL_ROOT_PATH}/src/nbl/asset/utils/IShaderCompiler.cpp ${NBL_ROOT_PATH}/src/nbl/asset/utils/CGLSLCompiler.cpp ${NBL_ROOT_PATH}/src/nbl/asset/utils/CHLSLCompiler.cpp diff --git a/src/nbl/asset/utils/ISPIRVDebloater.cpp b/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp similarity index 91% rename from src/nbl/asset/utils/ISPIRVDebloater.cpp rename to src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp index f05e9d70f5..981133536d 100644 --- a/src/nbl/asset/utils/ISPIRVDebloater.cpp +++ b/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp @@ -1,4 +1,4 @@ -#include "nbl/asset/utils/ISPIRVDebloater.h" +#include "nbl/asset/utils/ISPIRVEntryPointTrimmer.h" #include "nbl/asset/utils/ISPIRVOptimizer.h" #include "nbl_spirv_cross/spirv.hpp" @@ -10,7 +10,7 @@ using namespace nbl::asset; static constexpr spv_target_env SPIRV_VERSION = spv_target_env::SPV_ENV_UNIVERSAL_1_6; -ISPIRVDebloater::ISPIRVDebloater() +ISPIRVEntryPointTrimmer::ISPIRVEntryPointTrimmer() { constexpr auto optimizationPasses = std::array{ ISPIRVOptimizer::EOP_DEAD_BRANCH_ELIM, @@ -78,7 +78,7 @@ static bool validate(const uint32_t* binary, uint32_t binarySize, nbl::system::l return core.Validate(binary, binarySize, validatorOptions); } -ISPIRVDebloater::Result ISPIRVDebloater::debloat(const ICPUBuffer* spirvBuffer, const core::set& entryPoints, system::logger_opt_ptr logger) const +ISPIRVEntryPointTrimmer::Result ISPIRVEntryPointTrimmer::trim(const ICPUBuffer* spirvBuffer, const core::set& entryPoints, system::logger_opt_ptr logger) const { const auto* spirv = static_cast(spirvBuffer->getPointer()); 
const auto spirvDwordCount = spirvBuffer->getSize() / 4; @@ -134,7 +134,7 @@ ISPIRVDebloater::Result ISPIRVDebloater::debloat(const ICPUBuffer* spirvBuffer, std::vector minimizedSpirv; core::unordered_set removedEntryPointIds; - bool needDebloat = false; + bool needtrim = false; auto offset = HEADER_SIZE; auto parse_instruction = [](uint32_t instruction) -> std::tuple { @@ -185,16 +185,16 @@ ISPIRVDebloater::Result ISPIRVDebloater::debloat(const ICPUBuffer* spirvBuffer, foundEntryPoint += 1; // a valid spirv will have unique entry points, so this should works } else { - if (needDebloat == false) + if (needtrim == false) { minimizedSpirv.reserve(spirvDwordCount); minimizedSpirv.insert(minimizedSpirv.end(), spirv, spirv + curOffset); - needDebloat = true; + needtrim = true; } removedEntryPointIds.insert(curEntryPointId); continue; } - if (!needDebloat) continue; + if (!needtrim) continue; minimizedSpirv.insert(minimizedSpirv.end(), spirv + curOffset, spirv + offset); } @@ -208,7 +208,7 @@ ISPIRVDebloater::Result ISPIRVDebloater::debloat(const ICPUBuffer* spirvBuffer, }; } - if (!needDebloat) + if (!needtrim) { return { .spirv = nullptr, @@ -236,22 +236,22 @@ ISPIRVDebloater::Result ISPIRVDebloater::debloat(const ICPUBuffer* spirvBuffer, assert(validate(minimizedSpirv.data(), minimizedSpirv.size(), logger)); - auto debloatedSpirv = m_optimizer->optimize(minimizedSpirv.data(), minimizedSpirv.size(), logger); + auto trimmedSpirv = m_optimizer->optimize(minimizedSpirv.data(), minimizedSpirv.size(), logger); #ifdef _NBL_DEBUG logger.log("Before stripping capabilities:", nbl::system::ILogger::ELL_DEBUG); printCapabilities(spirv, spirvDwordCount, logger); logger.log("\n", nbl::system::ILogger::ELL_DEBUG); - const auto* debloatedSpirvBuffer = static_cast(debloatedSpirv->getPointer()); - const auto debloatedSpirvDwordCount = debloatedSpirv->getSize() / 4; + const auto* trimmedSpirvBuffer = static_cast(trimmedSpirv->getPointer()); + const auto trimmedSpirvDwordCount = 
trimmedSpirv->getSize() / 4; logger.log("After stripping capabilities:", nbl::system::ILogger::ELL_DEBUG); - printCapabilities(debloatedSpirvBuffer, debloatedSpirvDwordCount, logger); + printCapabilities(trimmedSpirvBuffer, trimmedSpirvDwordCount, logger); logger.log("\n", nbl::system::ILogger::ELL_DEBUG); #endif return { - .spirv = std::move(debloatedSpirv), + .spirv = std::move(trimmedSpirv), .isSuccess = true, }; diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 89f7ab1da3..9757182bcc 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -1,6 +1,6 @@ #include "nbl/video/CVulkanLogicalDevice.h" -#include "nbl/asset/utils/ISPIRVDebloater.h" +#include "nbl/asset/utils/ISPIRVEntryPointTrimmer.h" #include "nbl/video/CThreadSafeQueueAdapter.h" #include "nbl/video/surface/CSurfaceVulkan.h" diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 19dc001d8f..225a33bec3 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -7,17 +7,17 @@ using namespace nbl; using namespace nbl::video; -class SpirvDebloatTask +class SpirvTrimTask { public: - using EntryPoints = core::set; + using EntryPoints = core::set; struct ShaderInfo { EntryPoints entryPoints; - const asset::IShader* debloatedShaders; + const asset::IShader* trimmedShaders; }; - SpirvDebloatTask(asset::ISPIRVDebloater* debloater, system::logger_opt_ptr logger) : m_debloater(debloater), m_logger(logger) + SpirvTrimTask(asset::ISPIRVEntryPointTrimmer* trimer, system::logger_opt_ptr logger) : m_trimmer(trimer), m_logger(logger) { } @@ -31,39 +31,39 @@ class SpirvDebloatTask it->second.entryPoints.insert({ .name = shaderSpec.entryPoint, .stage = stage }); } - IGPUPipelineBase::SShaderSpecInfo debloat(const IGPUPipelineBase::SShaderSpecInfo& shaderSpec, core::vector>& outShaders) + IGPUPipelineBase::SShaderSpecInfo trim(const IGPUPipelineBase::SShaderSpecInfo& 
shaderSpec, core::vector>& outShaders) { const auto* shader = shaderSpec.shader; auto findResult = m_shaderInfoMap.find(shader); assert(findResult != m_shaderInfoMap.end()); const auto& entryPoints = findResult->second.entryPoints; - auto& debloatedShader = findResult->second.debloatedShaders; + auto& trimmedShader = findResult->second.trimmedShaders; - auto debloatedShaderSpec = shaderSpec; + auto trimmedShaderSpec = shaderSpec; if (shader != nullptr) { - if (debloatedShader == nullptr) + if (trimmedShader == nullptr) { const auto outShadersData = outShaders.data(); - outShaders.push_back(m_debloater->debloat(shader, entryPoints, m_logger)); + outShaders.push_back(m_trimmer->trim(shader, entryPoints, m_logger)); assert(outShadersData == outShaders.data()); - debloatedShader = outShaders.back().get(); + trimmedShader = outShaders.back().get(); } - debloatedShaderSpec.shader = debloatedShader; + trimmedShaderSpec.shader = trimmedShader; } - return debloatedShaderSpec; + return trimmedShaderSpec; } private: core::map m_shaderInfoMap; - asset::ISPIRVDebloater* m_debloater; + asset::ISPIRVEntryPointTrimmer* m_trimmer; const system::logger_opt_ptr m_logger; }; ILogicalDevice::ILogicalDevice(core::smart_refctd_ptr&& api, const IPhysicalDevice* const physicalDevice, const SCreationParams& params, const bool runningInRenderdoc) : m_api(api), m_physicalDevice(physicalDevice), m_enabledFeatures(params.featuresToEnable), m_compilerSet(params.compilerSet), m_logger(m_physicalDevice->getDebugCallback() ? 
m_physicalDevice->getDebugCallback()->getLogger() : nullptr), - m_spirvDebloater(core::make_smart_refctd_ptr()) + m_spirvTrimmer(core::make_smart_refctd_ptr()) { { uint32_t qcnt = 0u; @@ -805,18 +805,18 @@ bool ILogicalDevice::createComputePipelines(IGPUPipelineCache* const pipelineCac core::vector newParams(params.begin(), params.end()); const auto shaderCount = params.size(); - core::vector> debloatedShaders; // vector to hold all the debloated shaders, so the pointer from the new ShaderSpecInfo is not dangling - debloatedShaders.reserve(shaderCount); + core::vector> trimmedShaders; // vector to hold all the trimmed shaders, so the pointer from the new ShaderSpecInfo is not dangling + trimmedShaders.reserve(shaderCount); for (auto ix = 0u; ix < params.size(); ix++) { const auto& ci = params[ix]; - const core::set entryPoints = { asset::ISPIRVDebloater::EntryPoint{.name = ci.shader.entryPoint, .stage = hlsl::ShaderStage::ESS_COMPUTE} }; - debloatedShaders.push_back(m_spirvDebloater->debloat(ci.shader.shader, entryPoints, m_logger)); - auto debloatedShaderSpec = ci.shader; - debloatedShaderSpec.shader = debloatedShaders.back().get(); - newParams[ix].shader = debloatedShaderSpec; + const core::set entryPoints = { asset::ISPIRVEntryPointTrimmer::EntryPoint{.name = ci.shader.entryPoint, .stage = hlsl::ShaderStage::ESS_COMPUTE} }; + trimmedShaders.push_back(m_spirvTrimmer->trim(ci.shader.shader, entryPoints, m_logger)); + auto trimmedShaderSpec = ci.shader; + trimmedShaderSpec.shader = trimmedShaders.back().get(); + newParams[ix].shader = trimmedShaderSpec; } createComputePipelines_impl(pipelineCache,newParams,output,specConstantValidation); @@ -856,8 +856,8 @@ bool ILogicalDevice::createGraphicsPipelines( { return sum + param.getShaderCount(); }); - core::vector> debloatedShaders; // vector to hold all the debloated shaders, so the pointer from the new ShaderSpecInfo is not dangling - debloatedShaders.reserve(shaderCount); + core::vector> trimmedShaders; // vector to 
hold all the trimmed shaders, so the pointer from the new ShaderSpecInfo is not dangling + trimmedShaders.reserve(shaderCount); for (auto ix = 0u; ix < params.size(); ix++) { @@ -973,18 +973,18 @@ bool ILogicalDevice::createGraphicsPipelines( } } - SpirvDebloatTask debloatTask(m_spirvDebloater.get(), m_logger); - debloatTask.insertEntryPoint(ci.vertexShader, hlsl::ShaderStage::ESS_VERTEX); - debloatTask.insertEntryPoint(ci.tesselationControlShader, hlsl::ShaderStage::ESS_TESSELLATION_CONTROL); - debloatTask.insertEntryPoint(ci.tesselationEvaluationShader, hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION); - debloatTask.insertEntryPoint(ci.geometryShader, hlsl::ShaderStage::ESS_GEOMETRY); - debloatTask.insertEntryPoint(ci.fragmentShader, hlsl::ShaderStage::ESS_FRAGMENT); + SpirvTrimTask trimTask(m_spirvTrimmer.get(), m_logger); + trimTask.insertEntryPoint(ci.vertexShader, hlsl::ShaderStage::ESS_VERTEX); + trimTask.insertEntryPoint(ci.tesselationControlShader, hlsl::ShaderStage::ESS_TESSELLATION_CONTROL); + trimTask.insertEntryPoint(ci.tesselationEvaluationShader, hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION); + trimTask.insertEntryPoint(ci.geometryShader, hlsl::ShaderStage::ESS_GEOMETRY); + trimTask.insertEntryPoint(ci.fragmentShader, hlsl::ShaderStage::ESS_FRAGMENT); - newParams[ix].vertexShader = debloatTask.debloat(ci.vertexShader, debloatedShaders); - newParams[ix].tesselationControlShader = debloatTask.debloat(ci.tesselationControlShader, debloatedShaders); - newParams[ix].tesselationEvaluationShader = debloatTask.debloat(ci.tesselationEvaluationShader, debloatedShaders); - newParams[ix].geometryShader = debloatTask.debloat(ci.geometryShader, debloatedShaders); - newParams[ix].fragmentShader = debloatTask.debloat(ci.fragmentShader, debloatedShaders); + newParams[ix].vertexShader = trimTask.trim(ci.vertexShader, trimmedShaders); + newParams[ix].tesselationControlShader = trimTask.trim(ci.tesselationControlShader, trimmedShaders); + 
newParams[ix].tesselationEvaluationShader = trimTask.trim(ci.tesselationEvaluationShader, trimmedShaders); + newParams[ix].geometryShader = trimTask.trim(ci.geometryShader, trimmedShaders); + newParams[ix].fragmentShader = trimTask.trim(ci.fragmentShader, trimmedShaders); } createGraphicsPipelines_impl(pipelineCache, newParams, output, specConstantValidation); @@ -1074,8 +1074,8 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline return sum + param.shaderGroups.getCallableShaderCount(); }); const auto shaderCount = raygenCount + missShaderCount + hitShaderCount + callableShaderCount; - core::vector> debloatedShaders; // vector to hold all the debloated shaders, so the pointer from the new ShaderSpecInfo is not dangling - debloatedShaders.reserve(shaderCount); + core::vector> trimmedShaders; // vector to hold all the trimmed shaders, so the pointer from the new ShaderSpecInfo is not dangling + trimmedShaders.reserve(shaderCount); const auto missGroupCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) { @@ -1091,12 +1091,12 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline }); - core::vector debloatedMissSpecs(missGroupCount); - auto debloatedMissSpecData = debloatedMissSpecs.data(); - core::vector debloatedHitSpecs(hitGroupCount); - auto debloatedHitSpecData = debloatedHitSpecs.data(); - core::vector debloatedCallableSpecs(callableGroupCount); - auto debloatedCallableSpecData = debloatedCallableSpecs.data(); + core::vector trimmedMissSpecs(missGroupCount); + auto trimmedMissSpecData = trimmedMissSpecs.data(); + core::vector trimmedHitSpecs(hitGroupCount); + auto trimmedHitSpecData = trimmedHitSpecs.data(); + core::vector trimmedCallableSpecs(callableGroupCount); + auto trimmedCallableSpecData = trimmedCallableSpecs.data(); const auto& limits = getPhysicalDeviceLimits(); for (auto ix = 0u; ix < params.size(); ix++) @@ -1111,45 +1111,45 @@ bool 
ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline return false; } - SpirvDebloatTask debloatTask(m_spirvDebloater.get(), m_logger); - debloatTask.insertEntryPoint(param.shaderGroups.raygen, hlsl::ShaderStage::ESS_RAYGEN); + SpirvTrimTask trimTask(m_spirvTrimmer.get(), m_logger); + trimTask.insertEntryPoint(param.shaderGroups.raygen, hlsl::ShaderStage::ESS_RAYGEN); for (const auto& miss : param.shaderGroups.misses) - debloatTask.insertEntryPoint(miss, hlsl::ShaderStage::ESS_MISS); + trimTask.insertEntryPoint(miss, hlsl::ShaderStage::ESS_MISS); for (const auto& hit : param.shaderGroups.hits) { - debloatTask.insertEntryPoint(hit.closestHit, hlsl::ShaderStage::ESS_CLOSEST_HIT); - debloatTask.insertEntryPoint(hit.anyHit, hlsl::ShaderStage::ESS_ANY_HIT); - debloatTask.insertEntryPoint(hit.intersection, hlsl::ShaderStage::ESS_INTERSECTION); + trimTask.insertEntryPoint(hit.closestHit, hlsl::ShaderStage::ESS_CLOSEST_HIT); + trimTask.insertEntryPoint(hit.anyHit, hlsl::ShaderStage::ESS_ANY_HIT); + trimTask.insertEntryPoint(hit.intersection, hlsl::ShaderStage::ESS_INTERSECTION); } for (const auto& callable : param.shaderGroups.callables) - debloatTask.insertEntryPoint(callable, hlsl::ShaderStage::ESS_CALLABLE); + trimTask.insertEntryPoint(callable, hlsl::ShaderStage::ESS_CALLABLE); newParams[ix] = param; - newParams[ix].shaderGroups.raygen = debloatTask.debloat(param.shaderGroups.raygen, debloatedShaders); + newParams[ix].shaderGroups.raygen = trimTask.trim(param.shaderGroups.raygen, trimmedShaders); - newParams[ix].shaderGroups.misses = debloatedMissSpecs; + newParams[ix].shaderGroups.misses = trimmedMissSpecs; for (const auto& miss: param.shaderGroups.misses) { - *debloatedMissSpecData = debloatTask.debloat(miss, debloatedShaders); - debloatedMissSpecData++; + *trimmedMissSpecData = trimTask.trim(miss, trimmedShaders); + trimmedMissSpecData++; } - newParams[ix].shaderGroups.hits = debloatedHitSpecs; + newParams[ix].shaderGroups.hits = 
trimmedHitSpecs; for (const auto& hit: param.shaderGroups.hits) { - *debloatedHitSpecData = { - .closestHit = debloatTask.debloat(hit.closestHit, debloatedShaders), - .anyHit = debloatTask.debloat(hit.anyHit, debloatedShaders), - .intersection = debloatTask.debloat(hit.intersection, debloatedShaders), + *trimmedHitSpecData = { + .closestHit = trimTask.trim(hit.closestHit, trimmedShaders), + .anyHit = trimTask.trim(hit.anyHit, trimmedShaders), + .intersection = trimTask.trim(hit.intersection, trimmedShaders), }; - debloatedHitSpecData++; + trimmedHitSpecData++; } - newParams[ix].shaderGroups.callables = debloatedCallableSpecs; + newParams[ix].shaderGroups.callables = trimmedCallableSpecs; for (const auto& callable: param.shaderGroups.callables) { - *debloatedCallableSpecData = debloatTask.debloat(callable, debloatedShaders); - debloatedCallableSpecData++; + *trimmedCallableSpecData = trimTask.trim(callable, trimmedShaders); + trimmedCallableSpecData++; } } From 6505cde350e4ea9a36b0ee37a64e4846157f2d68 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Jun 2025 14:00:15 +0700 Subject: [PATCH 334/346] Fix indentation to use tabs --- include/nbl/asset/ICPUDescriptorSet.h | 56 +++++++++++++-------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/include/nbl/asset/ICPUDescriptorSet.h b/include/nbl/asset/ICPUDescriptorSet.h index 29cfe4cb1d..53151068ae 100644 --- a/include/nbl/asset/ICPUDescriptorSet.h +++ b/include/nbl/asset/ICPUDescriptorSet.h @@ -90,34 +90,34 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSet m_descriptorInfos[static_cast(IDescriptor::E_TYPE::ET_COUNT)]; - inline void visitDependents_impl(std::function visit) const override - { - for (auto i = 0u; i < static_cast(IDescriptor::E_TYPE::ET_COUNT); i++) - { - if (!m_descriptorInfos[i]) continue; - const auto size = m_descriptorInfos[i]->size(); - for (auto desc_i = 0u; desc_i < size; desc_i++) - { - auto* desc = m_descriptorInfos[i]->operator[](desc_i).desc.get(); - if 
(!desc) continue; - switch (IDescriptor::GetTypeCategory(static_cast(i))) - { - case IDescriptor::EC_BUFFER: - if (!visit(static_cast(desc))) return; - case IDescriptor::EC_SAMPLER: - if (!visit(static_cast(desc))) return; - case IDescriptor::EC_IMAGE: - if (!visit(static_cast(desc))) return; - case IDescriptor::EC_BUFFER_VIEW: - if (!visit(static_cast(desc))) return; - case IDescriptor::EC_ACCELERATION_STRUCTURE: - if (!visit(static_cast(desc))) return; - default: - break; - } - } - } - } + inline void visitDependents_impl(std::function visit) const override + { + for (auto i = 0u; i < static_cast(IDescriptor::E_TYPE::ET_COUNT); i++) + { + if (!m_descriptorInfos[i]) continue; + const auto size = m_descriptorInfos[i]->size(); + for (auto desc_i = 0u; desc_i < size; desc_i++) + { + auto* desc = m_descriptorInfos[i]->operator[](desc_i).desc.get(); + if (!desc) continue; + switch (IDescriptor::GetTypeCategory(static_cast(i))) + { + case IDescriptor::EC_BUFFER: + if (!visit(static_cast(desc))) return; + case IDescriptor::EC_SAMPLER: + if (!visit(static_cast(desc))) return; + case IDescriptor::EC_IMAGE: + if (!visit(static_cast(desc))) return; + case IDescriptor::EC_BUFFER_VIEW: + if (!visit(static_cast(desc))) return; + case IDescriptor::EC_ACCELERATION_STRUCTURE: + if (!visit(static_cast(desc))) return; + default: + break; + } + } + } + } }; } From 11df7a6b89ae6d79c2b6b42e059daa4d069c96ce Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Jun 2025 14:00:48 +0700 Subject: [PATCH 335/346] Initial refinement for IDescriptorSet::valid() --- include/nbl/asset/ICPUDescriptorSet.h | 28 +++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/include/nbl/asset/ICPUDescriptorSet.h b/include/nbl/asset/ICPUDescriptorSet.h index 53151068ae..857a437567 100644 --- a/include/nbl/asset/ICPUDescriptorSet.h +++ b/include/nbl/asset/ICPUDescriptorSet.h @@ -79,6 +79,34 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSetvalid()) return false; + for (auto 
type_i = 0u; type_i < static_cast(IDescriptor::E_TYPE::ET_COUNT); type_i++) + { + const auto descriptorType = static_cast(type_i); + const auto descriptorCategory = IDescriptor::GetTypeCategory(descriptorType); + const auto& descriptorRedirect = m_layout->getDescriptorRedirect(descriptorType); + const auto& descriptorInfoArr = m_descriptorInfos[type_i]; + + if (descriptorInfoArr->size() != descriptorRedirect.getTotalCount()) return false; + + auto offset = 0; + for (auto binding_i = 0; binding_i < descriptorRedirect.getBindingCount(); binding_i++) + { + const auto storageIndex = IDescriptorSetLayoutBase::CBindingRedirect::storage_range_index_t(binding_i); + const auto descriptorCount = descriptorRedirect.getCount(storageIndex); + const auto createFlags = descriptorRedirect.getCreateFlags(storageIndex); + const auto isPartiallyBound = !createFlags.hasFlags(IDescriptorSetLayoutBase::SBindingBase::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT); + for (auto descriptor_i = 0; descriptor_i < descriptorCount; descriptor_i++) + { + const auto storageOffset = IDescriptorSetLayoutBase::CBindingRedirect::storage_offset_t(offset); + const auto& descriptorInfo = descriptorInfoArr->operator[](offset); + + // partiallyBound layout can have null descriptor, otherwise not + if (!isPartiallyBound && !descriptorInfo.desc) return false; + if (descriptorInfo.desc && descriptorInfo.desc->getTypeCategory() != descriptorCategory) return false; + } + } + } + return true; } From 033c7cfbc061c8e2075b3e3fc5e5ae0ac54b39a0 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Jun 2025 15:17:16 +0700 Subject: [PATCH 336/346] Remove unnecessary final specifier --- include/nbl/asset/ICPUGraphicsPipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index a95a82633c..f4583f2a37 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -70,7 +70,7 @@ class 
ICPUGraphicsPipeline final : public ICPUPipelinevalid())return false; From 3cf455406a03605eba19ebb907ed9da86ef8ed11 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Jun 2025 15:18:29 +0700 Subject: [PATCH 337/346] Add const to hlsl::ShaderStage --- include/nbl/asset/ICPUComputePipeline.h | 4 ++-- include/nbl/asset/ICPUGraphicsPipeline.h | 8 ++++---- include/nbl/asset/ICPUPipeline.h | 4 ++-- include/nbl/asset/ICPURayTracingPipeline.h | 6 +++--- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 9b867e3a06..ffcf78e908 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -29,14 +29,14 @@ class ICPUComputePipeline final : public ICPUPipeline getSpecInfos(hlsl::ShaderStage stage) const override + inline std::span getSpecInfos(const hlsl::ShaderStage stage) const override { if (stage==hlsl::ShaderStage::ESS_COMPUTE) return {&m_specInfo,1}; return {}; } - inline std::span getSpecInfos(hlsl::ShaderStage stage) + inline std::span getSpecInfos(const hlsl::ShaderStage stage) { return base_t::getSpecInfos(stage); } diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index f4583f2a37..acc990f18c 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -40,7 +40,7 @@ class ICPUGraphicsPipeline final : public ICPUPipeline getSpecInfos(hlsl::ShaderStage stage) const override final + inline std::span getSpecInfos(const hlsl::ShaderStage stage) const override final { const auto stageIndex = stageToIndex(stage); if (stageIndex != -1) @@ -48,12 +48,12 @@ class ICPUGraphicsPipeline final : public ICPUPipeline getSpecInfos(hlsl::ShaderStage stage) + inline std::span getSpecInfos(const hlsl::ShaderStage stage) { return base_t::getSpecInfos(stage); } - SShaderSpecInfo* getSpecInfo(hlsl::ShaderStage stage) + SShaderSpecInfo* getSpecInfo(const 
hlsl::ShaderStage stage) { if (!isMutable()) return nullptr; const auto stageIndex = stageToIndex(stage); @@ -62,7 +62,7 @@ class ICPUGraphicsPipeline final : public ICPUPipeline getSpecInfos(hlsl::ShaderStage stage) const = 0; + virtual std::span getSpecInfos(const hlsl::ShaderStage stage) const = 0; }; @@ -132,7 +132,7 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase, public ICPUPipe } // Note(kevinyu): For some reason overload resolution cannot find this function when I name id getSpecInfos. It always use the const variant. Will check on it later. - inline std::span getSpecInfos(hlsl::ShaderStage stage) + inline std::span getSpecInfos(const hlsl::ShaderStage stage) { if (!isMutable()) return {}; const this_t* constPipeline = const_cast(this); diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h index 2c157f91e9..f56a5f6b46 100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -36,7 +36,7 @@ class ICPURayTracingPipeline final : public ICPUPipeline getSpecInfos(hlsl::ShaderStage stage) const override final + inline std::span getSpecInfos(const hlsl::ShaderStage stage) const override final { switch (stage) { @@ -57,12 +57,12 @@ class ICPURayTracingPipeline final : public ICPUPipeline getSpecInfos(hlsl::ShaderStage stage) + inline std::span getSpecInfos(const hlsl::ShaderStage stage) { return base_t::getSpecInfos(stage); } - inline core::vector* getSpecInfoVector(hlsl::ShaderStage stage) + inline core::vector* getSpecInfoVector(const hlsl::ShaderStage stage) { if (!isMutable()) return nullptr; switch (stage) From 9eab2f862aa41392b154d25ddee4cf9942438d45 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Jun 2025 15:18:57 +0700 Subject: [PATCH 338/346] Remove unnecessary final specifier --- include/nbl/asset/ICPURayTracingPipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/asset/ICPURayTracingPipeline.h 
b/include/nbl/asset/ICPURayTracingPipeline.h index f56a5f6b46..17c53557e1 100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -36,7 +36,7 @@ class ICPURayTracingPipeline final : public ICPUPipeline getSpecInfos(const hlsl::ShaderStage stage) const override final + inline std::span getSpecInfos(const hlsl::ShaderStage stage) const override { switch (stage) { From 74241f3406ee17f93caf9dd860b5df2f8392d084 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Jun 2025 15:19:35 +0700 Subject: [PATCH 339/346] Add comment on why we need multiple dead branch elimination and multiple dead function pass --- src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp b/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp index 981133536d..361f5d3cfa 100644 --- a/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp +++ b/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp @@ -12,6 +12,7 @@ static constexpr spv_target_env SPIRV_VERSION = spv_target_env::SPV_ENV_UNIVERSA ISPIRVEntryPointTrimmer::ISPIRVEntryPointTrimmer() { + // Multiple dead branch and dead function elimination because the first entry point removal might result to dead branch. Then the dead branch might result to dead function. Then, the dead function might result to dead branch and so on. 
constexpr auto optimizationPasses = std::array{ ISPIRVOptimizer::EOP_DEAD_BRANCH_ELIM, ISPIRVOptimizer::EOP_ELIM_DEAD_FUNCTIONS, From 04bcf0d86cfcaee56ee1df9b9aa71405b5ef9f86 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Jun 2025 15:19:56 +0700 Subject: [PATCH 340/346] Remove unused variable --- include/nbl/asset/ICPUDescriptorSet.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/nbl/asset/ICPUDescriptorSet.h b/include/nbl/asset/ICPUDescriptorSet.h index 857a437567..4247283c0e 100644 --- a/include/nbl/asset/ICPUDescriptorSet.h +++ b/include/nbl/asset/ICPUDescriptorSet.h @@ -97,7 +97,6 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSetoperator[](offset); // partiallyBound layout can have null descriptor, otherwise not From fc1983f3a1d2a15424a0fcce860aab27f23e4548 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Jun 2025 16:50:14 +0700 Subject: [PATCH 341/346] Small fixes on SpirvTrimTask --- include/nbl/video/IGPURayTracingPipeline.h | 33 ---------------------- src/nbl/video/ILogicalDevice.cpp | 23 ++------------- 2 files changed, 3 insertions(+), 53 deletions(-) diff --git a/include/nbl/video/IGPURayTracingPipeline.h b/include/nbl/video/IGPURayTracingPipeline.h index 56c7b38c29..690e6685d3 100644 --- a/include/nbl/video/IGPURayTracingPipeline.h +++ b/include/nbl/video/IGPURayTracingPipeline.h @@ -39,39 +39,6 @@ class IGPURayTracingPipeline : public IGPUPipelinesecond.entryPoints; - auto& trimmedShader = findResult->second.trimmedShaders; + auto& trimmedShader = findResult->second.trimmedShader; auto trimmedShaderSpec = shaderSpec; if (shader != nullptr) { if (trimmedShader == nullptr) { - const auto outShadersData = outShaders.data(); outShaders.push_back(m_trimmer->trim(shader, entryPoints, m_logger)); - assert(outShadersData == outShaders.data()); trimmedShader = outShaders.back().get(); } trimmedShaderSpec.shader = trimmedShader; @@ -1060,22 +1058,7 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const 
pipeline } core::vector newParams(params.begin(), params.end()); - const auto raygenCount = params.size(); // assume every param have raygen - const auto missShaderCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) - { - return sum + param.shaderGroups.getMissShaderCount(); - }); - const auto hitShaderCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) - { - return sum + param.shaderGroups.getHitShaderCount(); - }); - const auto callableShaderCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) - { - return sum + param.shaderGroups.getCallableShaderCount(); - }); - const auto shaderCount = raygenCount + missShaderCount + hitShaderCount + callableShaderCount; core::vector> trimmedShaders; // vector to hold all the trimmed shaders, so the pointer from the new ShaderSpecInfo is not dangling - trimmedShaders.reserve(shaderCount); const auto missGroupCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) { From 3767ede47b3841a6e8982ba1674ae5555c924f0f Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Jun 2025 16:52:44 +0700 Subject: [PATCH 342/346] Fix shader indexing logic in ray tracing pipeline creation --- src/nbl/video/CVulkanLogicalDevice.cpp | 37 +++++++++++++++++++------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 9757182bcc..54cc6afdf1 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -1516,10 +1516,28 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( for (const auto& info : createInfos) { - core::unordered_map shaderIndexes; - auto getVkShaderIndex = [&](const asset::IShader* shader) + struct VkShaderStageKey { - const auto index = shader == nullptr ? 
VK_SHADER_UNUSED_KHR : shaderIndexes[shader]; + const asset::IShader* shader; + std::string_view entryPoint; + bool operator==(const VkShaderStageKey& other) const = default; + + struct HashFunction + { + size_t operator()(const VkShaderStageKey& key) const + { + size_t rowHash = std::hash()(key.shader); + size_t colHash = std::hash()(key.entryPoint) << 1; + return rowHash ^ colHash; + } + }; + }; + + core::unordered_map shaderIndexes; + auto getVkShaderIndex = [&](const IGPUPipelineBase::SShaderSpecInfo& spec) + { + const auto key = VkShaderStageKey{ spec.shader, spec.entryPoint }; + const auto index = key.shader == nullptr ? VK_SHADER_UNUSED_KHR : shaderIndexes[key]; return index; }; @@ -1529,7 +1547,7 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( .sType = VK_STRUCTURE_TYPE_RAY_TRACING_SHADER_GROUP_CREATE_INFO_KHR, .pNext = nullptr, .type = VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR, - .generalShader = getVkShaderIndex(spec.shader), + .generalShader = getVkShaderIndex({spec.shader, spec.entryPoint}), .closestHitShader = VK_SHADER_UNUSED_KHR, .anyHitShader = VK_SHADER_UNUSED_KHR, .intersectionShader = VK_SHADER_UNUSED_KHR, @@ -1543,9 +1561,9 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( .type = group.intersection.shader == nullptr ? 
VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR : VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR, .generalShader = VK_SHADER_UNUSED_KHR, - .closestHitShader = getVkShaderIndex(group.closestHit.shader), - .anyHitShader = getVkShaderIndex(group.anyHit.shader), - .intersectionShader = getVkShaderIndex(group.intersection.shader), + .closestHitShader = getVkShaderIndex(group.closestHit), + .anyHitShader = getVkShaderIndex(group.anyHit), + .intersectionShader = getVkShaderIndex(group.intersection), }; }; @@ -1554,9 +1572,10 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( auto processSpecInfo = [&](const IGPUPipelineBase::SShaderSpecInfo& spec, hlsl::ShaderStage shaderStage) { if (!spec.shader) return; - if (shaderIndexes.find(spec.shader) == shaderIndexes.end()) + const auto key = VkShaderStageKey{ spec.shader, spec.entryPoint }; + if (shaderIndexes.find(key) == shaderIndexes.end()) { - shaderIndexes.insert({ spec.shader, std::distancepStages)>(outCreateInfo->pStages, outShaderStage)}); + shaderIndexes.insert({ key , std::distancepStages)>(outCreateInfo->pStages, outShaderStage)}); *(outShaderStage) = getVkShaderStageCreateInfoFrom(spec, shaderStage, false, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo,outSpecMapEntry,outSpecData); outShaderStage++; } From 061d49cccd93ba371307fa7f9f2045fcdde21219 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Jun 2025 18:09:38 +0700 Subject: [PATCH 343/346] Fix maxShaderStages calculation when creating ray tracing pipeline --- src/nbl/video/CVulkanLogicalDevice.cpp | 59 +++++++++++++++++--------- 1 file changed, 39 insertions(+), 20 deletions(-) diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 54cc6afdf1..9494efc2f2 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -1486,9 +1486,44 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( const VkPipelineCache 
vk_pipelineCache = pipelineCache ? static_cast(pipelineCache)->getInternalObject():VK_NULL_HANDLE; + struct ShaderModuleKey + { + const asset::IShader* shader; + std::string_view entryPoint; + bool operator==(const ShaderModuleKey& other) const = default; + + struct HashFunction + { + size_t operator()(const ShaderModuleKey& key) const + { + size_t rowHash = std::hash()(key.shader); + size_t colHash = std::hash()(key.entryPoint) << 1; + return rowHash ^ colHash; + } + }; + }; size_t maxShaderStages = 0; for (const auto& info : createInfos) - maxShaderStages += info.shaderGroups.getShaderCount(); + { + core::unordered_set shaderModules; + shaderModules.insert({ info.shaderGroups.raygen.shader, info.shaderGroups.raygen.entryPoint }); + for (const auto& miss : info.shaderGroups.misses) + { + shaderModules.insert({ miss.shader, miss.entryPoint }); + } + for (const auto& hit : info.shaderGroups.hits) + { + shaderModules.insert({ hit.closestHit.shader, hit.closestHit.entryPoint }); + shaderModules.insert({ hit.anyHit.shader, hit.anyHit.entryPoint }); + shaderModules.insert({ hit.intersection.shader, hit.intersection.entryPoint }); + } + for (const auto& callable : info.shaderGroups.callables) + { + shaderModules.insert({ callable.shader, callable.entryPoint }); + } + + maxShaderStages += shaderModules.size(); + } size_t maxShaderGroups = 0; for (const auto& info : createInfos) maxShaderGroups += info.shaderGroups.getShaderGroupCount(); @@ -1516,27 +1551,11 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( for (const auto& info : createInfos) { - struct VkShaderStageKey - { - const asset::IShader* shader; - std::string_view entryPoint; - bool operator==(const VkShaderStageKey& other) const = default; - - struct HashFunction - { - size_t operator()(const VkShaderStageKey& key) const - { - size_t rowHash = std::hash()(key.shader); - size_t colHash = std::hash()(key.entryPoint) << 1; - return rowHash ^ colHash; - } - }; - }; - core::unordered_map shaderIndexes; + 
core::unordered_map shaderIndexes; auto getVkShaderIndex = [&](const IGPUPipelineBase::SShaderSpecInfo& spec) { - const auto key = VkShaderStageKey{ spec.shader, spec.entryPoint }; + const auto key = ShaderModuleKey{ spec.shader, spec.entryPoint }; const auto index = key.shader == nullptr ? VK_SHADER_UNUSED_KHR : shaderIndexes[key]; return index; }; @@ -1572,7 +1591,7 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( auto processSpecInfo = [&](const IGPUPipelineBase::SShaderSpecInfo& spec, hlsl::ShaderStage shaderStage) { if (!spec.shader) return; - const auto key = VkShaderStageKey{ spec.shader, spec.entryPoint }; + const auto key = ShaderModuleKey{ spec.shader, spec.entryPoint }; if (shaderIndexes.find(key) == shaderIndexes.end()) { shaderIndexes.insert({ key , std::distancepStages)>(outCreateInfo->pStages, outShaderStage)}); From e6d8727b904f1f10a0aadb82b01ce35487f34953 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Tue, 17 Jun 2025 15:59:06 +0200 Subject: [PATCH 344/346] update submodule pointer --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index e30938c261..95d8f78465 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit e30938c2615dd5d3ab69cadca3ba11d1e03f8233 +Subproject commit 95d8f78465e100bb3a926cea412c21891c800b9d From 6fea3e5ca08d69303ba873166cbb60c7268ba18f Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Jun 2025 22:07:52 +0700 Subject: [PATCH 345/346] Add agrressive dce pass to remove type and remove multiple round of branch elim --- src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp b/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp index 361f5d3cfa..36d76eaf93 100644 --- a/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp +++ b/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp @@ -14,12 +14,11 @@ 
ISPIRVEntryPointTrimmer::ISPIRVEntryPointTrimmer() { // Multiple dead branch and dead function elimination because the first entry point removal might result to dead branch. Then the dead branch might result to dead function. Then, the dead function might result to dead branch and so on. constexpr auto optimizationPasses = std::array{ - ISPIRVOptimizer::EOP_DEAD_BRANCH_ELIM, - ISPIRVOptimizer::EOP_ELIM_DEAD_FUNCTIONS, ISPIRVOptimizer::EOP_DEAD_BRANCH_ELIM, ISPIRVOptimizer::EOP_ELIM_DEAD_FUNCTIONS, ISPIRVOptimizer::EOP_ELIM_DEAD_VARIABLES, ISPIRVOptimizer::EOP_ELIM_DEAD_CONSTANTS, + ISPIRVOptimizer::EOP_AGGRESSIVE_DCE, ISPIRVOptimizer::EOP_ELIM_DEAD_MEMBERS, ISPIRVOptimizer::EOP_TRIM_CAPABILITIES, }; From 0aa03c70861118bdefc9eae9c647a58212e68340 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Jun 2025 22:08:37 +0700 Subject: [PATCH 346/346] Remove comment --- src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp b/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp index 36d76eaf93..de78d2b162 100644 --- a/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp +++ b/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp @@ -12,7 +12,6 @@ static constexpr spv_target_env SPIRV_VERSION = spv_target_env::SPV_ENV_UNIVERSA ISPIRVEntryPointTrimmer::ISPIRVEntryPointTrimmer() { - // Multiple dead branch and dead function elimination because the first entry point removal might result to dead branch. Then the dead branch might result to dead function. Then, the dead function might result to dead branch and so on. constexpr auto optimizationPasses = std::array{ ISPIRVOptimizer::EOP_DEAD_BRANCH_ELIM, ISPIRVOptimizer::EOP_ELIM_DEAD_FUNCTIONS,